jaekookang committed on
Commit
4a43745
1 Parent(s): f396296

first upload

Browse files
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Espnet2_librispeech_100h_word_vs_bpe
3
  emoji: 💩
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
- app_file: app.py
8
  pinned: false
9
  license: mit
10
  ---
 
1
  ---
2
+ title: ESPNet2 ASR Librispeech word vs bpe tokens
3
  emoji: 💩
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
+ app_file: gradio_asr_en_libri100_word_vs_bpe.py
8
  pinned: false
9
  license: mit
10
  ---
examples/gentleman_16000.wav ADDED
Binary file (111 kB). View file
 
examples/jaekoo_numbers.wav ADDED
Binary file (218 kB). View file
 
examples/maybe_next_time.wav ADDED
Binary file (25.7 kB). View file
 
gradio_asr_en_libri100_word_vs_bpe.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''Librispeech 100h English ASR demo
2
+ @ML2 --> @HuggingFace
3
+
4
+ 2022-02-23 jkang first created
5
+ '''
6
+
7
+ import os
8
+ from glob import glob
9
+ from loguru import logger
10
+ # import soundfile as sf
11
+ import librosa
12
+ # from scipy.io import wavfile
13
+ import gradio as gr
14
+
15
+ # from espnet_model_zoo.downloader import ModelDownloader
16
+ from espnet2.bin.asr_inference import Speech2Text
17
+
18
# ---------- Settings ----------
# GPU_ID selects the CUDA device exposed to this process; '-1' exposes none,
# in which case DEVICE falls back to 'cpu'.
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'

# SERVER_PORT = 42208
# SERVER_NAME = "0.0.0.0"

MODEL_DIR = './model'

# Every .wav under EXAMPLE_DIR is offered as a clickable example in the UI.
EXAMPLE_DIR = './examples'
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))

# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Model ----------
# from_pretrained() is where the checkpoints are actually fetched and built,
# so the "downloaded"/"loaded" messages are emitted only after it returns.
# DEVICE is forwarded so that changing GPU_ID really moves inference to the GPU.
logger.info('download model')
model_word = Speech2Text.from_pretrained(
    "jkang/espnet2_librispeech_100_conformer_word",
    device=DEVICE,
)
model_bpe = Speech2Text.from_pretrained(
    "jkang/espnet2_librispeech_100_conformer",
    device=DEVICE,
)
logger.info('model downloaded')
logger.info('model loaded')
41
+
42
def predict(wav_file):
    '''Transcribe one recording with the word-token and the BPE-token model.

    Args:
        wav_file: path of the audio file handed over by the Gradio audio
            input (configured with type='filepath'); None or an empty
            string when nothing was recorded or uploaded.

    Returns:
        A (word_text, bpe_text) tuple with the best transcript from each
        model, or two empty strings when no audio was supplied.
    '''
    # Submitting the form without audio yields no path; answer with blank
    # transcripts instead of failing inside librosa.load().
    if not wav_file:
        return '', ''

    # Load audio, resampled to the 16 kHz rate requested here
    speech, _ = librosa.load(wav_file, sr=16000)
    logger.info('wav file loaded')

    # Run inference; keep the first (top-ranked) hypothesis of each model
    best_word = model_word(speech)[0]
    best_bpe = model_bpe(speech)[0]

    logger.info('predicted')
    # Element 0 of a hypothesis is the decoded text
    return best_word[0], best_bpe[0]
52
+
53
# Gradio UI: one microphone recording in, two transcripts out (one per model).
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.inputs.Audio(source='microphone', type='filepath', label='wav file'),
    ],
    outputs=[
        gr.outputs.Textbox(label='decoding result (word-token model)'),
        gr.outputs.Textbox(label='decoding result (BPE-token model)'),
    ],
    examples=examples,
    title='Comparison between word vs BPE tokens based on ESPNet2 ASR models',
    description='Two models were trained on Librispeech (clean-100h)',
    # article='<p style="text-align:center">Model URL<a target="_blank" href="https://huggingface.co/jkang/espnet2_librispeech_100_conformer">🤗</a></p>',
)
67
+
68
# Script entry point: serve the demo until interrupted, then shut it down.
if __name__ == '__main__':
    try:
        iface.launch(
            debug=True,
            # server_name=SERVER_NAME,
            # server_port=SERVER_PORT,
            enable_queue=True,
        )
    except KeyboardInterrupt:
        # KeyboardInterrupt carries no message, so printing it showed only a
        # blank line; record the shutdown in the app log like other events.
        logger.info('interrupted by user')
    finally:
        # Always release the server, whether launch() returned or was interrupted.
        iface.close()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ cmake
2
+ libsndfile1
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ espnet==0.10.6
2
+ espnet_model_zoo==0.1.7
3
+ gradio
4
+ loguru==0.6.0
5
+ librosa
6
+ soundfile