Ahsen Khaliq committed on
Commit
e8f9c74
1 Parent(s): db99e2a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -0
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import torch
3
+ import string
4
+ from espnet_model_zoo.downloader import ModelDownloader
5
+ from espnet2.bin.asr_inference import Speech2Text
6
+
7
+
8
+ import soundfile
9
+ import librosa.display
10
+ import matplotlib.pyplot as plt
11
+ import gradio as gr
12
+
13
+
14
+
15
+ def inference(audio)
16
+ speech, rate = soundfile.read(audio.name)
17
+ assert rate == fs, "mismatch in sampling rate"
18
+ nbests = speech2text(speech)
19
+ text, *_ = nbests[0]
20
+
21
+ print(f"Input Speech: {file_name}")
22
+ display(Audio(speech, rate=rate))
23
+ librosa.display.waveplot(speech, sr=rate)
24
+ plt.show()
25
+ print(f"ASR hypothesis: {text_normalizer(text)}")
26
+ print("*" * 50)
27
+
28
+
29
+ d = ModelDownloader()
30
+ # It may take a while to download and build models
31
+ speech2text = Speech2Text(
32
+ **d.download_and_unpack(tag),
33
+ device="cpu",
34
+ minlenratio=0.0,
35
+ maxlenratio=0.0,
36
+ ctc_weight=0.3,
37
+ beam_size=10,
38
+ batch_size=0,
39
+ nbest=1
40
+ )
41
+
42
+ def text_normalizer(text):
43
+ text = text.upper()
44
+ return text.translate(str.maketrans('', '', string.punctuation))
45
+
46
+ lang = 'multilingual'
47
+ fs = 16000
48
+ tag = 'ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best'