Spaces:
Runtime error
Runtime error
Ahsen Khaliq
committed on
Commit
•
e8f9c74
1
Parent(s):
db99e2a
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import torch
|
3 |
+
import string
|
4 |
+
from espnet_model_zoo.downloader import ModelDownloader
|
5 |
+
from espnet2.bin.asr_inference import Speech2Text
|
6 |
+
|
7 |
+
|
8 |
+
import soundfile
|
9 |
+
import librosa.display
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
+
import gradio as gr
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
+
def inference(audio):
    """Transcribe an audio file and return the normalized top hypothesis.

    Args:
        audio: Object whose ``name`` attribute is the path of the audio
            file to read (presumably an uploaded-file handle -- confirm
            against the caller).

    Returns:
        The first n-best hypothesis text after ``text_normalizer`` has been
        applied to it.

    Raises:
        ValueError: If the sampling rate of the file differs from ``fs``.
    """
    speech, rate = soundfile.read(audio.name)
    if rate != fs:
        # Explicit raise rather than ``assert`` so the check is not stripped
        # when Python runs with ``-O``.
        raise ValueError("mismatch in sampling rate")

    nbests = speech2text(speech)
    # Each n-best entry is a tuple whose first element is the text.
    text, *_ = nbests[0]
    normalized = text_normalizer(text)

    # ``audio.name`` is the only path available here; the previous
    # ``file_name`` was never defined.
    print(f"Input Speech: {audio.name}")
    # NOTE: the former ``display(Audio(...))`` call was removed because
    # neither name is imported in this file.
    librosa.display.waveplot(speech, sr=rate)
    plt.show()
    print(f"ASR hypothesis: {normalized}")
    print("*" * 50)

    # Hand the transcription back so a caller can present it.
    return normalized
|
27 |
+
|
28 |
+
|
29 |
+
# Model selection and expected input sampling rate. These must be assigned
# before the model is built below, because ``tag`` is read while
# constructing ``speech2text``.
lang = 'multilingual'
fs = 16000
tag = 'ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best'


def text_normalizer(text):
    """Return ``text`` upper-cased with every ``string.punctuation`` character removed."""
    text = text.upper()
    return text.translate(str.maketrans('', '', string.punctuation))


d = ModelDownloader()
# Downloading and building the model may take a while.
speech2text = Speech2Text(
    **d.download_and_unpack(tag),
    device="cpu",
    minlenratio=0.0,
    maxlenratio=0.0,
    ctc_weight=0.3,
    beam_size=10,
    batch_size=0,
    nbest=1
)
|