"""Gradio demo: real-time multilingual ASR with ESPnet2 pretrained models."""
import os

# NOTE(review): installing packages at runtime is fragile; prefer pinning
# `gradio` in requirements.txt. Kept for parity with the original Space setup.
os.system('pip install gradio --upgrade')
os.system('pip freeze')

import time
import torch
import string

from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text
import soundfile
import librosa
import matplotlib.pyplot as plt
import gradio as gr

# Sampling rate the ESPnet models expect; librosa resamples uploads to this.
FS = 16000

# Cache of model tag -> Speech2Text so each model is downloaded and built
# only once, instead of on every inference request as in the original code.
_MODEL_CACHE = {}


def text_normalizer(text):
    """Upper-case *text* and strip all ASCII punctuation in one pass."""
    text = text.upper()
    return text.translate(str.maketrans('', '', string.punctuation))


def _get_speech2text(tag):
    """Return a cached CPU Speech2Text instance for *tag*, building it on first use."""
    if tag not in _MODEL_CACHE:
        d = ModelDownloader()
        _MODEL_CACHE[tag] = Speech2Text(
            **d.download_and_unpack(tag),
            device="cpu",
            minlenratio=0.0,
            maxlenratio=0.0,
            ctc_weight=0.3,
            beam_size=10,
            batch_size=0,
            nbest=1,
        )
    return _MODEL_CACHE[tag]


def inference(audio, model):
    """Transcribe the uploaded *audio* file with the ESPnet model tag *model*.

    Returns a single string with the normalized 1-best ASR hypothesis.
    Raises ValueError if the decoded sampling rate is not FS (librosa is
    asked to resample to FS, so this should never fire in practice).
    """
    speech2text = _get_speech2text(model)
    speech, rate = librosa.load(audio.name, sr=FS)
    # Explicit raise instead of `assert`: asserts vanish under `python -O`.
    if rate != FS:
        raise ValueError("mismatch in sampling rate")
    nbests = speech2text(speech)
    text, *_ = nbests[0]  # 1-best hypothesis; remaining fields are scores/tokens
    return f"ASR hypothesis: {text_normalizer(text)}"


inputs = [
    gr.inputs.Audio(label="Input Audio", type="file"),
    gr.inputs.Dropdown(
        choices=[
            "ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best",
            "Shinji Watanabe/spgispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_unnorm_bpe5000_valid.acc.ave",
        ],
        type="value",
        default="ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best",
        label="model",
    ),
]
outputs = gr.outputs.Textbox(label="Output Text")
title = "ESPnet2-ASR"
description = (
    "Gradio demo for Real-time ASR with ESPnet2. To use it, simply upload "
    "your audio, or click one of the examples to load them. Read more at "
    "the links below."
)
# BUGFIX: the original used a plain single-quoted string spanning multiple
# physical lines, which is a SyntaxError in Python. Reconstructed here as an
# escaped literal with the visible text content; the original likely contained
# HTML anchor markup that was stripped during extraction — TODO confirm.
article = "\n\nESPnet: end-to-end speech processing toolkit | Github Repo\n\n"
examples = [["poem.wav"]]

gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
    enable_queue=True,
).launch()