File size: 1,976 Bytes
0c2f228
ef4b9d8
0c2f228
e8f9c74
 
 
 
 
 
 
 
c64ca36
e8f9c74
94e0305
6247742
 
 
098d68e
e8f9c74
 
 
 
d6a2d06
098d68e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7af941
cc8179b
d6a2d06
 
99e31bc
a97d8ed
c85ccee
a97d8ed
 
8a6ae20
372ceb3
00d0134
a97d8ed
3f53296
 
 
ceacc53
d6a2d06
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
# Upgrade Gradio every time the script starts, then print the installed
# package list to stdout. The return codes of both commands are discarded, so
# a failed upgrade does not stop the script.
# NOTE(review): the upgrade is unpinned while the UI code further down uses the
# `gr.inputs` / `gr.outputs` namespaces -- confirm those still exist in
# whatever Gradio version ends up installed.
os.system('pip install gradio --upgrade')
os.system('pip freeze')
import time
import torch
import string
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text


import soundfile
import librosa
import matplotlib.pyplot as plt


import gradio as gr



def text_normalizer(text):
    """Return *text* upper-cased with every ASCII punctuation character removed.

    Only the characters in ``string.punctuation`` are dropped; whitespace and
    any non-ASCII symbols are kept as they are.
    """
    # Mapping a character to None in a translation table deletes it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    shouted = text.upper()
    return shouted.translate(drop_punctuation)

# Decoders already built in this process, keyed by model tag, so repeated
# requests for the same model reuse the instance instead of downloading,
# unpacking and constructing it again on every call.
_SPEECH2TEXT_CACHE = {}


def _get_speech2text(tag):
    """Return the ``Speech2Text`` decoder for *tag*, building it on first use."""
    speech2text = _SPEECH2TEXT_CACHE.get(tag)
    if speech2text is None:
        downloader = ModelDownloader()
        speech2text = Speech2Text(
            **downloader.download_and_unpack(tag),
            device="cpu",
            minlenratio=0.0,
            maxlenratio=0.0,
            ctc_weight=0.3,
            beam_size=10,
            batch_size=0,
            nbest=1,
        )
        _SPEECH2TEXT_CACHE[tag] = speech2text
    return speech2text


def inference(audio, model):
    """Transcribe an audio file with the selected model.

    Args:
        audio: Object whose ``name`` attribute is the path of the audio file
            to transcribe.
        model: Model tag handed to ``ModelDownloader.download_and_unpack``.

    Returns:
        The string ``"ASR hypothesis: "`` followed by the first hypothesis,
        upper-cased and stripped of punctuation by ``text_normalizer``.

    Raises:
        ValueError: If the loaded audio does not come back at ``fs`` Hz.
    """
    fs = 16000

    speech2text = _get_speech2text(model)

    # Request the sample rate from a single constant so the load and the check
    # below cannot drift apart.
    speech, rate = librosa.load(audio.name, sr=fs)
    if rate != fs:
        # Explicit raise instead of `assert`, which is removed under `python -O`.
        raise ValueError("mismatch in sampling rate")

    nbests = speech2text(speech)
    # Each hypothesis is a sequence whose first element is the text; the
    # remaining elements are not needed here.
    text, *_ = nbests[0]
    return f"ASR hypothesis: {text_normalizer(text)}"
  
# Model tags offered in the dropdown; the first entry is the preselected one.
_MODEL_CHOICES = [
    "ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best",
    "Shinji Watanabe/spgispeech_asr_train_asr_conformer6_n_fft512_hop_length256_raw_en_unnorm_bpe5000_valid.acc.ave",
]

# Input widgets, listed in the order `inference(audio, model)` receives them.
inputs = [
    gr.inputs.Audio(label="Input Audio", type="file"),
    gr.inputs.Dropdown(
        choices=_MODEL_CHOICES,
        type="value",
        default=_MODEL_CHOICES[0],
        label="model",
    ),
]
outputs = gr.outputs.Textbox(label="Output Text")

# Texts shown around the demo.
title = "ESPnet2-ASR"
description = "Gradio demo for Real-time ASR with ESPnet2. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://espnet.github.io/espnet/'>ESPnet: end-to-end speech processing toolkit</a> | <a href='https://github.com/espnet/espnet'>Github Repo</a></p>"

# One example row; it supplies a value for the audio input only.
examples = [["poem.wav"]]

# Build the interface and start serving it.
_demo = gr.Interface(
    inference,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
    enable_queue=True,
)
_demo.launch()