Spaces:
Runtime error
Runtime error
jaekookang
committed on
Commit
•
4a43745
1
Parent(s):
f396296
first upload
Browse files- README.md +2 -2
- examples/gentleman_16000.wav +0 -0
- examples/jaekoo_numbers.wav +0 -0
- examples/maybe_next_time.wav +0 -0
- gradio_asr_en_libri100_word_vs_bpe.py +79 -0
- packages.txt +2 -0
- requirements.txt +6 -0
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 💩
|
4 |
colorFrom: red
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
7 |
-
app_file:
|
8 |
pinned: false
|
9 |
license: mit
|
10 |
---
|
|
|
1 |
---
|
2 |
+
title: ESPNet2 ASR Librispeech word vs bpe tokens
|
3 |
emoji: 💩
|
4 |
colorFrom: red
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
7 |
+
app_file: gradio_asr_en_libri100_word_vs_bpe.py
|
8 |
pinned: false
|
9 |
license: mit
|
10 |
---
|
examples/gentleman_16000.wav
ADDED
Binary file (111 kB). View file
|
|
examples/jaekoo_numbers.wav
ADDED
Binary file (218 kB). View file
|
|
examples/maybe_next_time.wav
ADDED
Binary file (25.7 kB). View file
|
|
gradio_asr_en_libri100_word_vs_bpe.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'''Librispeech 100h English ASR demo
|
2 |
+
@ML2 --> @HuggingFace
|
3 |
+
|
4 |
+
2022-02-23 jkang first created
|
5 |
+
'''
|
6 |
+
|
7 |
+
import os
|
8 |
+
from glob import glob
|
9 |
+
from loguru import logger
|
10 |
+
# import soundfile as sf
|
11 |
+
import librosa
|
12 |
+
# from scipy.io import wavfile
|
13 |
+
import gradio as gr
|
14 |
+
|
15 |
+
# from espnet_model_zoo.downloader import ModelDownloader
|
16 |
+
from espnet2.bin.asr_inference import Speech2Text
|
17 |
+
|
18 |
+
# ---------- Settings ----------
# GPU_ID '-1' selects CPU: DEVICE below resolves to 'cpu' for this value and
# to 'cuda' for any other id.
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'

# SERVER_PORT = 42208
# SERVER_NAME = "0.0.0.0"

MODEL_DIR = './model'

EXAMPLE_DIR = './examples'
# Every .wav file under EXAMPLE_DIR, sorted so the list order is stable.
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))

# ---------- Logging ----------
# Append to app.log so messages from earlier runs are kept.
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Model ----------
# The progress messages now bracket the actual work: previously both
# 'download model' and 'model downloaded' were logged before any model had
# been fetched.
logger.info('download model')
# DEVICE is passed through so the GPU_ID setting above actually takes effect
# (with GPU_ID == '-1' this is 'cpu').
model_word = Speech2Text.from_pretrained(
    "jkang/espnet2_librispeech_100_conformer_word",
    device=DEVICE,
)
model_bpe = Speech2Text.from_pretrained(
    "jkang/espnet2_librispeech_100_conformer",
    device=DEVICE,
)
logger.info('model downloaded')
logger.info('model loaded')
|
41 |
+
|
42 |
+
def predict(wav_file):
    """Transcribe one audio file with both the word-token and BPE-token models.

    Args:
        wav_file: Path of the audio file to decode. May be None or empty when
            no audio was supplied (presumably when the form is submitted
            without a recording -- confirm against the gradio version in use).

    Returns:
        A 2-tuple ``(word_model_text, bpe_model_text)``. Both entries are
        empty strings when no audio file was supplied.
    """
    # Guard: without a path there is nothing to load, and librosa.load would
    # fail on it.
    if not wav_file:
        return '', ''

    # Load audio, resampled to 16 kHz; the returned sample rate is not needed.
    speech, _ = librosa.load(wav_file, sr=16000)
    logger.info('wav file loaded')

    # Run inference; keep the first result of each model.
    word_result = model_word(speech)[0]
    bpe_result = model_bpe(speech)[0]

    logger.info('predicted')
    # Element 0 of each result is what the UI displays (presumably the decoded
    # text of the best hypothesis).
    return word_result[0], bpe_result[0]
|
52 |
+
|
53 |
+
# Web UI: one audio input feeding `predict`, one text box per model output.
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.inputs.Audio(source='microphone', type='filepath', label='wav file'),
    ],
    outputs=[
        gr.outputs.Textbox(label='decoding result (word-token model)'),
        gr.outputs.Textbox(label='decoding result (BPE-token model)'),
    ],
    examples=examples,
    title='Comparison between word vs BPE tokens based on ESPNet2 ASR models',
    description='Two models were trained on Librispeech (clean-100h)',
    # article='<p style="text-align:center">Model URL<a target="_blank" href="https://huggingface.co/jkang/espnet2_librispeech_100_conformer">🤗</a></p>',
)
|
67 |
+
|
68 |
+
if __name__ == '__main__':
    # Options handed to iface.launch(); the server name/port overrides stay
    # disabled, as in the original.
    launch_options = {
        'debug': True,
        # 'server_name': SERVER_NAME,
        # 'server_port': SERVER_PORT,
        'enable_queue': True,
    }
    try:
        iface.launch(**launch_options)
    except KeyboardInterrupt as interrupt:
        # Ctrl-C is reported with print rather than a traceback.
        print(interrupt)
    finally:
        # Always close the interface, whether launch returned or raised.
        iface.close()
|
packages.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
cmake
|
2 |
+
libsndfile1
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
espnet==0.10.6
|
2 |
+
espnet_model_zoo==0.1.7
|
3 |
+
gradio
|
4 |
+
loguru==0.6.0
|
5 |
+
librosa
|
6 |
+
soundfile
|