pkufool csukuangfj committed on
Commit
8435a41
0 Parent(s):

Duplicate from k2-fsa/streaming-automatic-speech-recognition

Browse files

Co-authored-by: fangjun <csukuangfj@users.noreply.huggingface.co>

Files changed (6) hide show
  1. .gitattributes +34 -0
  2. .gitignore +2 -0
  3. README.md +14 -0
  4. app.py +205 -0
  5. model.py +162 -0
  6. requirements.txt +11 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ flagged/
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Streaming Automatic Speech Recognition
3
+ emoji: 📚
4
+ colorFrom: pink
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 2.9.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: k2-fsa/streaming-automatic-speech-recognition
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
4
+ #
5
+ # See LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ # References:
20
+ # https://gradio.app/docs/#dropdown
21
+
22
+ import logging
23
+ import os
24
+ from typing import List, Optional
25
+
26
+ import gradio as gr
27
+ import torchaudio
28
+
29
+ from model import create_recognizer, language_to_models
30
+
# Title shown at the top of the Gradio demo page.
title = "Next-gen Kaldi: Real-time streaming speech recognition"

# Markdown rendered above the demo UI.
description = """
This space shows how to do **real-time** streaming speech recognition
with **Next-gen Kaldi**.

Please visit
<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>
for non-streaming speech recognition with **Next-gen Kaldi**.

It is running on CPU within a docker container provided by Hugging Face.

**Caution**: You may see **significant delay** since HuggingFace sends
your recorded data by chunks and the interval between chunks is
unknown, e.g., may be 2 seconds.
"""

# Markdown rendered below the demo UI. The "click Clear" instruction is
# repeated on purpose for emphasis — do not deduplicate.
article = """
See more information by visiting the following links:

- <https://github.com/k2-fsa/icefall>
- <https://github.com/k2-fsa/sherpa>
- <https://github.com/k2-fsa/k2>
- <https://github.com/lhotse-speech/lhotse>

If you want to deploy it locally, please see
<https://k2-fsa.github.io/sherpa/>

Usage instructions:

(1) Select a language and a model from the dropdown box

(2) Click the Record button to start

(3) Speak

(4) Click the Stop Recording button to stop

(5) **Remember to click the Clear button before you re-click the Record button**

(6) **Remember to click the Clear button before you re-click the Record button**

(7) **Remember to click the Clear button before you re-click the Record button**
"""
74
+
75
+
def convert_to_wav(in_filename: str) -> str:
    """Convert the input audio file to a 16 kHz wave file.

    Args:
      in_filename:
        Path to the input audio file; any format that ffmpeg understands.

    Returns:
      Path of the converted file (``in_filename`` + ".wav").  The
      conversion is best-effort, matching the original ``os.system``
      behavior: if ffmpeg fails or is missing, no exception is raised
      and the returned path may not exist.
    """
    import subprocess

    out_filename = in_filename + ".wav"
    # Pass an argument list instead of an interpolated shell string so
    # that quotes/special characters in the filename cannot be
    # re-interpreted by a shell (the previous f-string + os.system form
    # broke on filenames containing a single quote).
    cmd = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel",
        "error",
        "-i",
        in_filename,
        "-ar",
        "16000",
        out_filename,
    ]
    try:
        subprocess.run(cmd, check=False)
    except OSError:
        # ffmpeg binary not found; keep the old silent best-effort contract.
        logging.warning("Failed to run ffmpeg: %s", cmd)

    return out_filename
85
+
86
+
def get_language_and_model() -> List[str]:
    """Build the dropdown choices.

    Each entry is of the form ``language | repo_id``.
    """
    return [
        f"{language} | {repo_id}"
        for language, repo_ids in language_to_models.items()
        for repo_id in repo_ids
    ]
97
+
98
+
# All dropdown entries, e.g. "English | Zengwei/icefall-asr-...".
language_model_list = get_language_and_model()
100
+
101
+
def process(language_and_repo_id: str, audio: Optional[str], state=None):
    """Handle one chunk of streaming microphone audio.

    Args:
      language_and_repo_id:
        A string of the form "language | repo_id" (a dropdown entry).
      audio:
        Path to the recorded audio chunk (not necessarily wave format),
        or None once recording has stopped.
      state:
        None on the first call; afterwards a list of
        [error message ("" if none), language_and_repo_id,
         recognizer, stream, wasOk].

    Returns:
      A tuple (recognized text or error message, updated state) —
      matching the two outputs declared on the Interface.
    """
    language, repo_id = language_and_repo_id.split("|")
    language = language.strip()
    repo_id = repo_id.strip()

    if state is None:
        # First chunk of a session: create the recognizer and stream lazily.
        print("language", language)
        print("repo_id", repo_id)
        recognizer = create_recognizer(repo_id)
        stream = recognizer.create_stream()
        state = ["", language_and_repo_id, recognizer, stream, True]

    if not state[-1]:
        # An earlier chunk put us into an error state; keep reporting it.
        return state[0], state

    if audio is None:
        # Recording has stopped; emit the final result (or the error).
        if "Error" in state[0]:
            return state[0], state
        else:
            recognizer = state[2]
            stream = state[3]
            # Fix: also return the state so the number of returned values
            # matches the two declared outputs (previously only the text
            # was returned on this path).
            return recognizer.get_result(stream).text.lower(), state

    if state[1] != language_and_repo_id:
        state[0] = (
            "Error: Please don't change the language and model during recognition "
            + "or "
            + "please press the Clear button before you re-click Record or re-select "
            + "language and model.\n\n\n"
            + "Hint: Click Stop Recording and then press Clear to fix this error."
        )
        state[-1] = False

        return state[0], state

    filename = convert_to_wav(audio)

    samples, sample_rate = torchaudio.load(filename)
    assert sample_rate == 16000, (sample_rate, 16000)
    samples = samples.squeeze(0)
    # logging.info(f"duration: {samples.numel() / 16000} s")

    recognizer = state[2]
    stream = state[3]
    stream.accept_waveform(16000, samples)

    # Decode every chunk that has become ready with the new samples.
    while recognizer.is_ready(stream):
        recognizer.decode_stream(stream)

    text = recognizer.get_result(stream).text.lower()
    logging.info(text)

    return text, state
170
+
171
+
# NOTE: gr.inputs / gr.outputs is the legacy gradio 2.x API
# (sdk_version 2.9.0 in this Space's README).
language_dropdown = gr.inputs.Dropdown(
    label="Select a language and a model",
    choices=language_model_list,
    default=language_model_list[0],
)

itf1 = gr.Interface(
    title=title,
    description=description,
    article=article,
    fn=process,
    inputs=[
        language_dropdown,
        gr.inputs.Audio(
            source="microphone",
            type="filepath",
            label="Press me to start recognition",
        ),
        # "state" carries the session state (see process()) across the
        # live streaming callbacks.
        "state",
    ],
    outputs=[
        gr.outputs.Textbox(type="str", label="result"),
        gr.outputs.State(label=""),
    ],
    # live=True makes gradio call fn repeatedly while recording.
    live=True,
)
198
+
199
+
if __name__ == "__main__":
    # Configure logging before launching so decoded text is visible
    # in the container logs.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    itf1.launch()
model.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
2
+ #
3
+ # See LICENSE for clarification regarding multiple authors
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
import os

from huggingface_hub import hf_hub_download

# HACK: copy k2's shared libraries into sherpa's lib directory before
# importing sherpa — presumably so its native extension can resolve
# them at import time.  NOTE(review): the path is hard-coded for the
# Python 3.8 Hugging Face Space image; verify if the runtime changes.
os.system(
    "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
)

# Must come after the cp above.
import sherpa  # noqa
25
+
26
+
def _get_nn_model_filename(
    repo_id: str,
    filename: str,
    subfolder: str = "exp",
) -> str:
    """Download a model file from the Hugging Face hub.

    Returns the local path of the downloaded file.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
38
+
39
+
def _get_token_filename(
    repo_id: str,
    filename: str = "tokens.txt",
    subfolder: str = "data/lang_char",
) -> str:
    """Download a token table from the Hugging Face hub.

    Returns the local path of the downloaded file.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
51
+
52
+
def get_english_model_2022_12_19(repo_id: str):
    """Create a streaming recognizer from a jit-traced English model.

    The encoder/decoder/joiner are shipped as separate traced files, so
    ``nn_model`` is left empty in the config.

    NOTE(review): the date in this function's name does not match the
    "2022-12-29" date in the repo id it serves; kept as-is since the
    registry below references this exact name.
    """
    model_files = {
        part: _get_nn_model_filename(
            repo_id=repo_id, filename=f"{part}_jit_trace.pt"
        )
        for part in ("encoder", "decoder", "joiner")
    }
    token_table = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.samp_freq = 16000
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    cfg = sherpa.OnlineRecognizerConfig(
        nn_model="",
        encoder_model=model_files["encoder"],
        decoder_model=model_files["decoder"],
        joiner_model=model_files["joiner"],
        tokens=token_table,
        use_gpu=False,
        feat_config=features,
        decoding_method="greedy_search",
        chunk_size=32,
    )

    return sherpa.OnlineRecognizer(cfg)
78
+
79
+
def get_chinese_english_mixed_model_conv_emformer_transducer_stateless2_zh(
    repo_id: str,
) -> sherpa.OnlineRecognizer:
    """Create a streaming recognizer for the conv-emformer Chinese+English model.

    This model ships as a single torchscript checkpoint, so it is passed
    via ``nn_model`` rather than as separate encoder/decoder/joiner files.
    """
    checkpoint = _get_nn_model_filename(
        repo_id=repo_id, filename="cpu_jit-epoch-11-avg-1.pt"
    )
    token_table = _get_token_filename(repo_id=repo_id, subfolder="data/lang_char_bpe")

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.samp_freq = 16000
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    return sherpa.OnlineRecognizer(
        sherpa.OnlineRecognizerConfig(
            nn_model=checkpoint,
            tokens=token_table,
            use_gpu=False,
            feat_config=features,
            decoding_method="greedy_search",
        )
    )
102
+
103
+
def get_chinese_english_mixed_model_k2fsa_zipformer_chinese_english_mixed(
    repo_id: str,
) -> sherpa.OnlineRecognizer:
    """Create a streaming recognizer for the zipformer Chinese+English model.

    The encoder/decoder/joiner are shipped as separate traced files, so
    ``nn_model`` is left empty in the config.
    """
    model_files = {
        part: _get_nn_model_filename(
            repo_id=repo_id, filename=f"{part}_jit_trace.pt"
        )
        for part in ("encoder", "decoder", "joiner")
    }
    token_table = _get_token_filename(repo_id=repo_id, subfolder="data/lang_char_bpe")

    features = sherpa.FeatureConfig()
    features.fbank_opts.frame_opts.samp_freq = 16000
    features.fbank_opts.mel_opts.num_bins = 80
    features.fbank_opts.frame_opts.dither = 0

    cfg = sherpa.OnlineRecognizerConfig(
        nn_model="",
        encoder_model=model_files["encoder"],
        decoder_model=model_files["decoder"],
        joiner_model=model_files["joiner"],
        tokens=token_table,
        use_gpu=False,
        feat_config=features,
        decoding_method="greedy_search",
        chunk_size=32,
    )

    return sherpa.OnlineRecognizer(cfg)
131
+
132
+
def create_recognizer(repo_id: str) -> sherpa.OnlineRecognizer:
    """Create a streaming recognizer for the given model repo.

    Args:
      repo_id:
        Hugging Face repo id of one of the supported models.

    Raises:
      ValueError: if ``repo_id`` is not a supported model.
    """
    # Look up the merged registry instead of probing each per-language
    # dict separately; all_models is defined below as the union of
    # english_models and chinese_english_mixed_models, so behavior is
    # identical.
    try:
        factory = all_models[repo_id]
    except KeyError:
        raise ValueError(f"Unsupported repo_id: {repo_id}") from None
    return factory(repo_id)
140
+
141
+
# Registries mapping repo_id -> factory(repo_id) -> sherpa.OnlineRecognizer.
english_models = {
    # https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
    "Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29": get_english_model_2022_12_19
}

chinese_english_mixed_models = {
    # https://huggingface.co/pfluo/k2fsa-zipformer-chinese-english-mixed
    "pfluo/k2fsa-zipformer-chinese-english-mixed": get_chinese_english_mixed_model_k2fsa_zipformer_chinese_english_mixed,
    # https://huggingface.co/ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh
    "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": get_chinese_english_mixed_model_conv_emformer_transducer_stateless2_zh,
}

# Union of the per-language registries; used by create_recognizer().
all_models = {
    **english_models,
    **chinese_english_mixed_models,
}

# Maps the dropdown language label to the list of supported repo_ids.
language_to_models = {
    "English": list(english_models.keys()),
    "Chinese+English": list(chinese_english_mixed_models.keys()),
}
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ https://download.pytorch.org/whl/cpu/torch-1.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl
2
+ https://download.pytorch.org/whl/cpu/torchaudio-0.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl
3
+
4
+ https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/k2-1.23.4.dev20230130%2Bcpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
5
+ https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/k2_sherpa-1.1-cp38-cp38-linux_x86_64.whl
6
+ https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/kaldifeat-1.22-cp38-cp38-linux_x86_64.whl
7
+
8
+ sentencepiece>=0.1.96
9
+ numpy
10
+
11
+ huggingface_hub