peteralexandercharles csukuangfj committed on
Commit
2fe5632
0 Parent(s):

Duplicate from EuroPython2022/automatic-speech-recognition-with-next-gen-kaldi

Browse files
Files changed (44) hide show
  1. .gitattributes +27 -0
  2. README.md +14 -0
  3. app.py +331 -0
  4. decode.py +121 -0
  5. examples.py +256 -0
  6. giga-tokens.txt +500 -0
  7. model.py +585 -0
  8. requirements.txt +11 -0
  9. test_wavs/aidatatang_200zh/README.md +2 -0
  10. test_wavs/aidatatang_200zh/T0055G0036S0002.wav +0 -0
  11. test_wavs/aidatatang_200zh/T0055G0036S0003.wav +0 -0
  12. test_wavs/aidatatang_200zh/T0055G0036S0004.wav +0 -0
  13. test_wavs/aishell2/ID0012W0030.wav +0 -0
  14. test_wavs/aishell2/ID0012W0162.wav +0 -0
  15. test_wavs/aishell2/ID0012W0215.wav +0 -0
  16. test_wavs/aishell2/README.md +2 -0
  17. test_wavs/aishell2/trans.txt +3 -0
  18. test_wavs/arabic/a.wav +0 -0
  19. test_wavs/arabic/b.wav +0 -0
  20. test_wavs/arabic/c.wav +0 -0
  21. test_wavs/arabic/trans.txt +3 -0
  22. test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav +0 -0
  23. test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav +0 -0
  24. test_wavs/gigaspeech/1-minute-audiobook.opus +0 -0
  25. test_wavs/gigaspeech/100-seconds-podcast.opus +0 -0
  26. test_wavs/gigaspeech/100-seconds-youtube.opus +0 -0
  27. test_wavs/librispeech/1089-134686-0001.wav +0 -0
  28. test_wavs/librispeech/1221-135766-0001.wav +0 -0
  29. test_wavs/librispeech/1221-135766-0002.wav +0 -0
  30. test_wavs/librispeech/README.md +2 -0
  31. test_wavs/librispeech/trans.txt +3 -0
  32. test_wavs/tal_csasr/0.wav +0 -0
  33. test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav +0 -0
  34. test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav +0 -0
  35. test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav +0 -0
  36. test_wavs/tal_csasr/README.md +2 -0
  37. test_wavs/tibetan/a_0_cacm-A70_31116.wav +0 -0
  38. test_wavs/tibetan/a_0_cacm-A70_31117.wav +0 -0
  39. test_wavs/tibetan/a_0_cacm-A70_31118.wav +0 -0
  40. test_wavs/tibetan/trans.txt +3 -0
  41. test_wavs/wenetspeech/DEV_T0000000000.opus +0 -0
  42. test_wavs/wenetspeech/DEV_T0000000001.opus +0 -0
  43. test_wavs/wenetspeech/DEV_T0000000002.opus +0 -0
  44. test_wavs/wenetspeech/README.md +2 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Automatic Speech Recognition
3
+ emoji: 🌖
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.0.26
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: EuroPython2022/automatic-speech-recognition-with-next-gen-kaldi
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
4
+ #
5
+ # See LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ # References:
20
+ # https://gradio.app/docs/#dropdown
21
+
22
+ import logging
23
+ import os
24
+ import time
25
+ from datetime import datetime
26
+
27
+ import gradio as gr
28
+ import torch
29
+ import torchaudio
30
+
31
+ from examples import examples
32
+ from model import get_pretrained_model, language_to_models, sample_rate
33
+
34
+ languages = list(language_to_models.keys())
35
+
36
+
37
def convert_to_wav(in_filename: str) -> str:
    """Convert the given audio file to a 16 kHz wave file with ffmpeg.

    Args:
      in_filename:
        Path of the input audio file. Any format ffmpeg understands
        is accepted.
    Returns:
      Return the path of the generated wave file, which is the input
      path with ".wav" appended. The file is created as a side effect.
    """
    out_filename = in_filename + ".wav"
    logging.info(f"Converting '{in_filename}' to '{out_filename}'")
    # NOTE(review): the filename is interpolated into a shell command.
    # Gradio-generated temp paths are safe, but arbitrary user-chosen
    # names would not be -- consider subprocess.run with an arg list.
    ret = os.system(f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 '{out_filename}'")
    if ret != 0:
        # Keep the original best-effort contract (still return the
        # expected output path), but surface the failure instead of
        # silently discarding ffmpeg's exit status.
        logging.warning(f"ffmpeg exited with status {ret} for '{in_filename}'")
    return out_filename
43
+
44
+
45
def build_html_output(s: str, style: str = "result_item_success"):
    """Wrap *s* in the nested result <div> markup styled by the page CSS.

    Args:
      s:
        The HTML fragment to display.
      style:
        CSS class selecting the look of the inner box
        (success green or error red).
    """
    inner = f"<div class='result_item {style}'>"
    return (
        "\n"
        "    <div class='result'>\n"
        f"      {inner}\n"
        f"        {s}\n"
        "      </div>\n"
        "    </div>\n"
        "    "
    )
53
+
54
+
55
def process_uploaded_file(
    language: str,
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
    in_filename: str,
):
    """Recognize speech from a file uploaded in the "Upload from disk" tab.

    Returns a (text, html_info) pair; on error the text is empty and the
    HTML block carries the error message.
    """
    if not in_filename:
        # Nothing was uploaded yet -- tell the user what to do.
        msg = (
            "Please first upload a file and then click "
            'the button "submit for recognition"'
        )
        return "", build_html_output(msg, "result_item_error")

    logging.info(f"Processing uploaded file: {in_filename}")
    try:
        return process(
            in_filename=in_filename,
            language=language,
            repo_id=repo_id,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    except Exception as e:
        # Report failures in the UI instead of crashing the app.
        logging.info(str(e))
        return "", build_html_output(str(e), "result_item_error")
81
+
82
+
83
def process_microphone(
    language: str,
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
    in_filename: str,
):
    """Recognize speech from a recording made in the microphone tab.

    Returns a (text, html_info) pair; on error the text is empty and the
    HTML block carries the error message.
    """
    if not in_filename:
        # No recording available yet -- walk the user through the steps.
        msg = (
            "Please first click 'Record from microphone', speak, "
            "click 'Stop recording', and then "
            "click the button 'submit for recognition'"
        )
        return "", build_html_output(msg, "result_item_error")

    logging.info(f"Processing microphone: {in_filename}")
    try:
        return process(
            in_filename=in_filename,
            language=language,
            repo_id=repo_id,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    except Exception as e:
        # Report failures in the UI instead of crashing the app.
        logging.info(str(e))
        return "", build_html_output(str(e), "result_item_error")
110
+
111
+
112
@torch.no_grad()
def process(
    language: str,
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
    in_filename: str,
):
    """Run speech recognition on ``in_filename``.

    Args:
      language:
        Display language of the selected model (used only for logging).
      repo_id:
        Hugging Face repo id of the pretrained model to load.
      decoding_method:
        Either "greedy_search" or "modified_beam_search".
      num_active_paths:
        Beam size; used only for modified_beam_search.
      in_filename:
        Path of the audio file to transcribe.
    Returns:
      A (text, html_info) pair: the recognized text plus an HTML block
      with timing statistics (duration, processing time, RTF).
    """
    logging.info(f"language: {language}")
    logging.info(f"repo_id: {repo_id}")
    logging.info(f"decoding_method: {decoding_method}")
    logging.info(f"num_active_paths: {num_active_paths}")
    logging.info(f"in_filename: {in_filename}")

    filename = convert_to_wav(in_filename)

    now = datetime.now()
    date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
    logging.info(f"Started at {date_time}")

    start = time.time()

    recognizer = get_pretrained_model(
        repo_id,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    s = recognizer.create_stream()

    s.accept_wave_file(filename)
    recognizer.decode_stream(s)

    text = s.result.text

    # Bug fix: take a *fresh* timestamp here. The original reused the
    # `now` captured before decoding, so the "Finished at" log line
    # actually showed the start time.
    date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
    end = time.time()

    metadata = torchaudio.info(filename)
    duration = metadata.num_frames / sample_rate
    rtf = (end - start) / duration

    logging.info(f"Finished at {date_time} s. Elapsed: {end - start: .3f} s")

    info = f"""
    Wave duration  : {duration: .3f} s <br/>
    Processing time: {end - start: .3f} s <br/>
    RTF: {end - start: .3f}/{duration: .3f} = {rtf:.3f} <br/>
    """
    if rtf > 1:
        # The first call downloads/loads the model, which dominates the
        # measured time; tell the user the RTF is not representative.
        info += (
            "<br/>We are loading the model for the first run. "
            "Please run again to measure the real RTF.<br/>"
        )

    logging.info(info)
    logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")

    return text, build_html_output(info)
170
+
171
+
172
+ title = "# Automatic Speech Recognition with Next-gen Kaldi"
173
+ description = """
174
+ This space shows how to do automatic speech recognition with Next-gen Kaldi.
175
+
176
+ It is running on CPU within a docker container provided by Hugging Face.
177
+
178
+ See more information by visiting the following links:
179
+
180
+ - <https://github.com/k2-fsa/icefall>
181
+ - <https://github.com/k2-fsa/sherpa>
182
+ - <https://github.com/k2-fsa/k2>
183
+ - <https://github.com/lhotse-speech/lhotse>
184
+
185
+ If you want to deploy it locally, please see
186
+ <https://k2-fsa.github.io/sherpa/>
187
+ """
188
+
189
+ # css style is copied from
190
+ # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
191
+ css = """
192
+ .result {display:flex;flex-direction:column}
193
+ .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
194
+ .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
195
+ .result_item_error {background-color:#ff7070;color:white;align-self:start}
196
+ """
197
+
198
+
199
def update_model_dropdown(language: str):
    """Refresh the model dropdown after the language radio changes.

    Selects the first model of the newly chosen language.
    """
    try:
        choices = language_to_models[language]
    except KeyError:
        raise ValueError(f"Unsupported language: {language}") from None
    return gr.Dropdown.update(choices=choices, value=choices[0])
205
+
206
+
207
+ demo = gr.Blocks(css=css)
208
+
209
+
210
+ with demo:
211
+ gr.Markdown(title)
212
+ language_choices = list(language_to_models.keys())
213
+
214
+ language_radio = gr.Radio(
215
+ label="Language",
216
+ choices=language_choices,
217
+ value=language_choices[0],
218
+ )
219
+ model_dropdown = gr.Dropdown(
220
+ choices=language_to_models[language_choices[0]],
221
+ label="Select a model",
222
+ value=language_to_models[language_choices[0]][0],
223
+ )
224
+
225
+ language_radio.change(
226
+ update_model_dropdown,
227
+ inputs=language_radio,
228
+ outputs=model_dropdown,
229
+ )
230
+
231
+ decoding_method_radio = gr.Radio(
232
+ label="Decoding method",
233
+ choices=["greedy_search", "modified_beam_search"],
234
+ value="greedy_search",
235
+ )
236
+
237
+ num_active_paths_slider = gr.Slider(
238
+ minimum=1,
239
+ value=4,
240
+ step=1,
241
+ label="Number of active paths for modified_beam_search",
242
+ )
243
+
244
+ with gr.Tabs():
245
+ with gr.TabItem("Upload from disk"):
246
+ uploaded_file = gr.Audio(
247
+ source="upload", # Choose between "microphone", "upload"
248
+ type="filepath",
249
+ optional=False,
250
+ label="Upload from disk",
251
+ )
252
+ upload_button = gr.Button("Submit for recognition")
253
+ uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
254
+ uploaded_html_info = gr.HTML(label="Info")
255
+
256
+ gr.Examples(
257
+ examples=examples,
258
+ inputs=[
259
+ language_radio,
260
+ model_dropdown,
261
+ decoding_method_radio,
262
+ num_active_paths_slider,
263
+ uploaded_file,
264
+ ],
265
+ outputs=[uploaded_output, uploaded_html_info],
266
+ fn=process_uploaded_file,
267
+ )
268
+
269
+ with gr.TabItem("Record from microphone"):
270
+ microphone = gr.Audio(
271
+ source="microphone", # Choose between "microphone", "upload"
272
+ type="filepath",
273
+ optional=False,
274
+ label="Record from microphone",
275
+ )
276
+
277
+ record_button = gr.Button("Submit for recognition")
278
+ recorded_output = gr.Textbox(label="Recognized speech from recordings")
279
+ recorded_html_info = gr.HTML(label="Info")
280
+
281
+ gr.Examples(
282
+ examples=examples,
283
+ inputs=[
284
+ language_radio,
285
+ model_dropdown,
286
+ decoding_method_radio,
287
+ num_active_paths_slider,
288
+ microphone,
289
+ ],
290
+ outputs=[recorded_output, recorded_html_info],
291
+ fn=process_microphone,
292
+ )
293
+
294
+ upload_button.click(
295
+ process_uploaded_file,
296
+ inputs=[
297
+ language_radio,
298
+ model_dropdown,
299
+ decoding_method_radio,
300
+ num_active_paths_slider,
301
+ uploaded_file,
302
+ ],
303
+ outputs=[uploaded_output, uploaded_html_info],
304
+ )
305
+
306
+ record_button.click(
307
+ process_microphone,
308
+ inputs=[
309
+ language_radio,
310
+ model_dropdown,
311
+ decoding_method_radio,
312
+ num_active_paths_slider,
313
+ microphone,
314
+ ],
315
+ outputs=[recorded_output, recorded_html_info],
316
+ )
317
+ gr.Markdown(description)
318
+
319
+ torch.set_num_threads(1)
320
+ torch.set_num_interop_threads(1)
321
+
322
+ torch._C._jit_set_profiling_executor(False)
323
+ torch._C._jit_set_profiling_mode(False)
324
+ torch._C._set_graph_executor_optimize(False)
325
+
326
+ if __name__ == "__main__":
327
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
328
+
329
+ logging.basicConfig(format=formatter, level=logging.INFO)
330
+
331
+ demo.launch()
decode.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
2
+ #
3
+ # Copied from https://github.com/k2-fsa/sherpa/blob/master/sherpa/bin/conformer_rnnt/decode.py
4
+ #
5
+ # See LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ import math
20
+ from typing import List
21
+
22
+ import torch
23
+ from sherpa import RnntConformerModel, greedy_search, modified_beam_search
24
+ from torch.nn.utils.rnn import pad_sequence
25
+
26
+ LOG_EPS = math.log(1e-10)
27
+
28
+
29
@torch.no_grad()
def run_model_and_do_greedy_search(
    model: RnntConformerModel,
    features: List[torch.Tensor],
) -> List[List[int]]:
    """Run RNN-T model with the given features and use greedy search
    to decode the output of the model.

    Args:
      model:
        The RNN-T model.
      features:
        A list of 2-D tensors. Each entry is of shape
        (num_frames, feature_dim).
    Returns:
      Return a list-of-list containing the decoding token IDs.
    """
    device = model.device

    # Batch the per-utterance features, remembering each true length.
    feature_lens = torch.tensor(
        [f.size(0) for f in features],
        dtype=torch.int64,
    ).to(device)
    padded = pad_sequence(
        features,
        batch_first=True,
        padding_value=LOG_EPS,
    ).to(device)

    encoder_out, encoder_out_length = model.encoder(
        features=padded,
        features_length=feature_lens,
    )

    return greedy_search(
        model=model,
        encoder_out=encoder_out,
        encoder_out_length=encoder_out_length.cpu(),
    )
71
+
72
+
73
@torch.no_grad()
def run_model_and_do_modified_beam_search(
    model: RnntConformerModel,
    features: List[torch.Tensor],
    num_active_paths: int,
) -> List[List[int]]:
    """Run RNN-T model with the given features and use modified beam search
    to decode the output of the model.

    (Doc fix: the original docstring said "greedy search", copied from
    the sibling function above.)

    Args:
      model:
        The RNN-T model.
      features:
        A list of 2-D tensors. Each entry is of shape
        (num_frames, feature_dim).
      num_active_paths:
        It specifies the number of active paths for each utterance. Due to
        merging paths with identical token sequences, the actual number
        may be less than "num_active_paths".
    Returns:
      Return a list-of-list containing the decoding token IDs.
    """
    features_length = torch.tensor(
        [f.size(0) for f in features],
        dtype=torch.int64,
    )
    features = pad_sequence(
        features,
        batch_first=True,
        padding_value=LOG_EPS,
    )

    device = model.device
    features = features.to(device)
    features_length = features_length.to(device)

    encoder_out, encoder_out_length = model.encoder(
        features=features,
        features_length=features_length,
    )

    hyp_tokens = modified_beam_search(
        model=model,
        encoder_out=encoder_out,
        encoder_out_length=encoder_out_length.cpu(),
        num_active_paths=num_active_paths,
    )
    return hyp_tokens
examples.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
4
+ #
5
+ # See LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ examples = [
19
+ [
20
+ "Chinese+English",
21
+ "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
22
+ "greedy_search",
23
+ 4,
24
+ "./test_wavs/tal_csasr/0.wav",
25
+ ],
26
+ [
27
+ "English",
28
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
29
+ "greedy_search",
30
+ 4,
31
+ "./test_wavs/librispeech/1089-134686-0001.wav",
32
+ ],
33
+ [
34
+ "Chinese",
35
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
36
+ "greedy_search",
37
+ 4,
38
+ "./test_wavs/wenetspeech/DEV_T0000000000.opus",
39
+ ],
40
+ [
41
+ "German",
42
+ "csukuangfj/wav2vec2.0-torchaudio",
43
+ "greedy_search",
44
+ 4,
45
+ "./test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav",
46
+ ],
47
+ [
48
+ "Arabic",
49
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
50
+ "greedy_search",
51
+ 4,
52
+ "./test_wavs/arabic/a.wav",
53
+ ],
54
+ [
55
+ "Tibetan",
56
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
57
+ "greedy_search",
58
+ 4,
59
+ "./test_wavs/tibetan/a_0_cacm-A70_31117.wav",
60
+ ],
61
+ # librispeech
62
+ # https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs
63
+ [
64
+ "English",
65
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
66
+ "greedy_search",
67
+ 4,
68
+ "./test_wavs/librispeech/1089-134686-0001.wav",
69
+ ],
70
+ [
71
+ "English",
72
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
73
+ "greedy_search",
74
+ 4,
75
+ "./test_wavs/librispeech/1221-135766-0001.wav",
76
+ ],
77
+ [
78
+ "English",
79
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
80
+ "greedy_search",
81
+ 4,
82
+ "./test_wavs/librispeech/1221-135766-0002.wav",
83
+ ],
84
+ # gigaspeech
85
+ [
86
+ "English",
87
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
88
+ "greedy_search",
89
+ 4,
90
+ "./test_wavs/gigaspeech/1-minute-audiobook.opus",
91
+ ],
92
+ [
93
+ "English",
94
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
95
+ "greedy_search",
96
+ 4,
97
+ "./test_wavs/gigaspeech/100-seconds-podcast.opus",
98
+ ],
99
+ [
100
+ "English",
101
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
102
+ "greedy_search",
103
+ 4,
104
+ "./test_wavs/gigaspeech/100-seconds-youtube.opus",
105
+ ],
106
+ # wenetspeech
107
+ # https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2/tree/main/test_wavs
108
+ [
109
+ "Chinese",
110
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
111
+ "greedy_search",
112
+ 4,
113
+ "./test_wavs/wenetspeech/DEV_T0000000000.opus",
114
+ ],
115
+ [
116
+ "Chinese",
117
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
118
+ "greedy_search",
119
+ 4,
120
+ "./test_wavs/wenetspeech/DEV_T0000000001.opus",
121
+ ],
122
+ [
123
+ "Chinese",
124
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
125
+ "greedy_search",
126
+ 4,
127
+ "./test_wavs/wenetspeech/DEV_T0000000002.opus",
128
+ ],
129
+ # aishell2-A
130
+ # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12/tree/main/test_wavs
131
+ [
132
+ "Chinese",
133
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
134
+ "greedy_search",
135
+ 4,
136
+ "./test_wavs/aishell2/ID0012W0030.wav",
137
+ ],
138
+ [
139
+ "Chinese",
140
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
141
+ "greedy_search",
142
+ 4,
143
+ "./test_wavs/aishell2/ID0012W0162.wav",
144
+ ],
145
+ [
146
+ "Chinese",
147
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
148
+ "greedy_search",
149
+ 4,
150
+ "./test_wavs/aishell2/ID0012W0215.wav",
151
+ ],
152
+ # aishell2-B
153
+ # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12/tree/main/test_wavs
154
+ [
155
+ "Chinese",
156
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
157
+ "greedy_search",
158
+ 4,
159
+ "./test_wavs/aishell2/ID0012W0030.wav",
160
+ ],
161
+ [
162
+ "Chinese",
163
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
164
+ "greedy_search",
165
+ 4,
166
+ "./test_wavs/aishell2/ID0012W0162.wav",
167
+ ],
168
+ [
169
+ "Chinese",
170
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
171
+ "greedy_search",
172
+ 4,
173
+ "./test_wavs/aishell2/ID0012W0215.wav",
174
+ ],
175
+ # aidatatang_200zh
176
+ # https://huggingface.co/luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2/tree/main/test_wavs
177
+ [
178
+ "Chinese",
179
+ "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
180
+ "greedy_search",
181
+ 4,
182
+ "./test_wavs/aidatatang_200zh/T0055G0036S0002.wav",
183
+ ],
184
+ [
185
+ "Chinese",
186
+ "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
187
+ "greedy_search",
188
+ 4,
189
+ "./test_wavs/aidatatang_200zh/T0055G0036S0003.wav",
190
+ ],
191
+ [
192
+ "Chinese",
193
+ "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
194
+ "greedy_search",
195
+ 4,
196
+ "./test_wavs/aidatatang_200zh/T0055G0036S0004.wav",
197
+ ],
198
+ # tal_csasr
199
+ [
200
+ "Chinese+English",
201
+ "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
202
+ "greedy_search",
203
+ 4,
204
+ "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav",
205
+ ],
206
+ [
207
+ "Chinese+English",
208
+ "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
209
+ "greedy_search",
210
+ 4,
211
+ "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav",
212
+ ],
213
+ [
214
+ "Chinese+English",
215
+ "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
216
+ "greedy_search",
217
+ 4,
218
+ "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav",
219
+ ],
220
+ [
221
+ "Tibetan",
222
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
223
+ "greedy_search",
224
+ 4,
225
+ "./test_wavs/tibetan/a_0_cacm-A70_31116.wav",
226
+ ],
227
+ [
228
+ "Tibetan",
229
+ "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
230
+ "greedy_search",
231
+ 4,
232
+ "./test_wavs/tibetan/a_0_cacm-A70_31118.wav",
233
+ ],
234
+ # arabic
235
+ [
236
+ "Arabic",
237
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
238
+ "greedy_search",
239
+ 4,
240
+ "./test_wavs/arabic/b.wav",
241
+ ],
242
+ [
243
+ "Arabic",
244
+ "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
245
+ "greedy_search",
246
+ 4,
247
+ "./test_wavs/arabic/c.wav",
248
+ ],
249
+ [
250
+ "German",
251
+ "csukuangfj/wav2vec2.0-torchaudio",
252
+ "greedy_search",
253
+ 4,
254
+ "./test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav",
255
+ ],
256
+ ]
giga-tokens.txt ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <blk> 0
2
+ <sos/eos> 1
3
+ <unk> 2
4
+ S 3
5
+ T 4
6
+ ▁THE 5
7
+ ▁A 6
8
+ E 7
9
+ ▁AND 8
10
+ ▁TO 9
11
+ N 10
12
+ D 11
13
+ ▁OF 12
14
+ ' 13
15
+ ING 14
16
+ ▁I 15
17
+ Y 16
18
+ ▁IN 17
19
+ ED 18
20
+ ▁THAT 19
21
+ ▁ 20
22
+ P 21
23
+ R 22
24
+ ▁YOU 23
25
+ M 24
26
+ RE 25
27
+ ER 26
28
+ C 27
29
+ O 28
30
+ ▁IT 29
31
+ L 30
32
+ A 31
33
+ U 32
34
+ G 33
35
+ ▁WE 34
36
+ ▁IS 35
37
+ ▁SO 36
38
+ AL 37
39
+ I 38
40
+ ▁S 39
41
+ ▁RE 40
42
+ AR 41
43
+ B 42
44
+ ▁FOR 43
45
+ ▁C 44
46
+ ▁BE 45
47
+ LE 46
48
+ F 47
49
+ W 48
50
+ ▁E 49
51
+ ▁HE 50
52
+ LL 51
53
+ ▁WAS 52
54
+ LY 53
55
+ OR 54
56
+ IN 55
57
+ ▁F 56
58
+ VE 57
59
+ ▁THIS 58
60
+ TH 59
61
+ K 60
62
+ ▁ON 61
63
+ IT 62
64
+ ▁B 63
65
+ ▁WITH 64
66
+ ▁BUT 65
67
+ EN 66
68
+ CE 67
69
+ RI 68
70
+ ▁DO 69
71
+ UR 70
72
+ ▁HAVE 71
73
+ ▁DE 72
74
+ ▁ME 73
75
+ ▁T 74
76
+ ENT 75
77
+ CH 76
78
+ ▁THEY 77
79
+ ▁NOT 78
80
+ ES 79
81
+ V 80
82
+ ▁AS 81
83
+ RA 82
84
+ ▁P 83
85
+ ON 84
86
+ TER 85
87
+ ▁ARE 86
88
+ ▁WHAT 87
89
+ IC 88
90
+ ▁ST 89
91
+ ▁LIKE 90
92
+ ATION 91
93
+ ▁OR 92
94
+ ▁CA 93
95
+ ▁AT 94
96
+ H 95
97
+ ▁KNOW 96
98
+ ▁G 97
99
+ AN 98
100
+ ▁CON 99
101
+ IL 100
102
+ ND 101
103
+ RO 102
104
+ ▁HIS 103
105
+ ▁CAN 104
106
+ ▁ALL 105
107
+ TE 106
108
+ ▁THERE 107
109
+ ▁SU 108
110
+ ▁MO 109
111
+ ▁MA 110
112
+ LI 111
113
+ ▁ONE 112
114
+ ▁ABOUT 113
115
+ LA 114
116
+ ▁CO 115
117
+ - 116
118
+ ▁MY 117
119
+ ▁HAD 118
120
+ CK 119
121
+ NG 120
122
+ ▁NO 121
123
+ MENT 122
124
+ AD 123
125
+ LO 124
126
+ ME 125
127
+ ▁AN 126
128
+ ▁FROM 127
129
+ NE 128
130
+ ▁IF 129
131
+ VER 130
132
+ ▁JUST 131
133
+ ▁PRO 132
134
+ ION 133
135
+ ▁PA 134
136
+ ▁WHO 135
137
+ ▁SE 136
138
+ EL 137
139
+ IR 138
140
+ ▁US 139
141
+ ▁UP 140
142
+ ▁YOUR 141
143
+ CI 142
144
+ RY 143
145
+ ▁GO 144
146
+ ▁SHE 145
147
+ ▁LE 146
148
+ ▁OUT 147
149
+ ▁PO 148
150
+ ▁HO 149
151
+ ATE 150
152
+ ▁BO 151
153
+ ▁BY 152
154
+ ▁FA 153
155
+ ▁MI 154
156
+ AS 155
157
+ MP 156
158
+ ▁HER 157
159
+ VI 158
160
+ ▁THINK 159
161
+ ▁SOME 160
162
+ ▁WHEN 161
163
+ ▁AH 162
164
+ ▁PEOPLE 163
165
+ IG 164
166
+ ▁WA 165
167
+ ▁TE 166
168
+ ▁LA 167
169
+ ▁WERE 168
170
+ ▁LI 169
171
+ ▁WOULD 170
172
+ ▁SEE 171
173
+ ▁WHICH 172
174
+ DE 173
175
+ GE 174
176
+ ▁K 175
177
+ IGHT 176
178
+ ▁HA 177
179
+ ▁OUR 178
180
+ UN 179
181
+ ▁HOW 180
182
+ ▁GET 181
183
+ IS 182
184
+ UT 183
185
+ Z 184
186
+ CO 185
187
+ ET 186
188
+ UL 187
189
+ IES 188
190
+ IVE 189
191
+ AT 190
192
+ ▁O 191
193
+ ▁DON 192
194
+ LU 193
195
+ ▁TIME 194
196
+ ▁WILL 195
197
+ ▁MORE 196
198
+ ▁SP 197
199
+ ▁NOW 198
200
+ RU 199
201
+ ▁THEIR 200
202
+ ▁UN 201
203
+ ITY 202
204
+ OL 203
205
+ X 204
206
+ TI 205
207
+ US 206
208
+ ▁VERY 207
209
+ TION 208
210
+ ▁FI 209
211
+ ▁SAY 210
212
+ ▁BECAUSE 211
213
+ ▁EX 212
214
+ ▁RO 213
215
+ ERS 214
216
+ IST 215
217
+ ▁DA 216
218
+ TING 217
219
+ ▁EN 218
220
+ OM 219
221
+ ▁BA 220
222
+ ▁BEEN 221
223
+ ▁LO 222
224
+ ▁UM 223
225
+ AGE 224
226
+ ABLE 225
227
+ ▁WO 226
228
+ ▁RA 227
229
+ ▁OTHER 228
230
+ ▁REALLY 229
231
+ ENCE 230
232
+ ▁GOING 231
233
+ ▁HIM 232
234
+ ▁HAS 233
235
+ ▁THEM 234
236
+ ▁DIS 235
237
+ ▁WANT 236
238
+ ID 237
239
+ TA 238
240
+ ▁LOOK 239
241
+ KE 240
242
+ ▁DID 241
243
+ ▁SA 242
244
+ ▁VI 243
245
+ ▁SAID 244
246
+ ▁RIGHT 245
247
+ ▁THESE 246
248
+ ▁WORK 247
249
+ ▁COM 248
250
+ ALLY 249
251
+ FF 250
252
+ QU 251
253
+ AC 252
254
+ ▁DR 253
255
+ ▁WAY 254
256
+ ▁INTO 255
257
+ MO 256
258
+ TED 257
259
+ EST 258
260
+ ▁HERE 259
261
+ OK 260
262
+ ▁COULD 261
263
+ ▁WELL 262
264
+ MA 263
265
+ ▁PRE 264
266
+ ▁DI 265
267
+ MAN 266
268
+ ▁COMP 267
269
+ ▁THEN 268
270
+ IM 269
271
+ ▁PER 270
272
+ ▁NA 271
273
+ ▁WHERE 272
274
+ ▁TWO 273
275
+ ▁WI 274
276
+ ▁FE 275
277
+ INE 276
278
+ ▁ANY 277
279
+ TURE 278
280
+ ▁OVER 279
281
+ BO 280
282
+ ACH 281
283
+ OW 282
284
+ ▁MAKE 283
285
+ ▁TRA 284
286
+ HE 285
287
+ UND 286
288
+ ▁EVEN 287
289
+ ANCE 288
290
+ ▁YEAR 289
291
+ HO 290
292
+ AM 291
293
+ ▁CHA 292
294
+ ▁BACK 293
295
+ VO 294
296
+ ANT 295
297
+ DI 296
298
+ ▁ALSO 297
299
+ ▁THOSE 298
300
+ ▁MAN 299
301
+ CTION 300
302
+ ICAL 301
303
+ ▁JO 302
304
+ ▁OP 303
305
+ ▁NEW 304
306
+ ▁MU 305
307
+ ▁HU 306
308
+ ▁KIND 307
309
+ ▁NE 308
310
+ CA 309
311
+ END 310
312
+ TIC 311
313
+ FUL 312
314
+ ▁YEAH 313
315
+ SH 314
316
+ ▁APP 315
317
+ ▁THINGS 316
318
+ SIDE 317
319
+ ▁GOOD 318
320
+ ONE 319
321
+ ▁TAKE 320
322
+ CU 321
323
+ ▁EVERY 322
324
+ ▁MEAN 323
325
+ ▁FIRST 324
326
+ OP 325
327
+ ▁TH 326
328
+ ▁MUCH 327
329
+ ▁PART 328
330
+ UGH 329
331
+ ▁COME 330
332
+ J 331
333
+ ▁THAN 332
334
+ ▁EXP 333
335
+ ▁AGAIN 334
336
+ ▁LITTLE 335
337
+ MB 336
338
+ ▁NEED 337
339
+ ▁TALK 338
340
+ IF 339
341
+ FOR 340
342
+ ▁SH 341
343
+ ISH 342
344
+ ▁STA 343
345
+ ATED 344
346
+ ▁GU 345
347
+ ▁LET 346
348
+ IA 347
349
+ ▁MAR 348
350
+ ▁DOWN 349
351
+ ▁DAY 350
352
+ ▁GA 351
353
+ ▁SOMETHING 352
354
+ ▁BU 353
355
+ DUC 354
356
+ HA 355
357
+ ▁LOT 356
358
+ ▁RU 357
359
+ ▁THOUGH 358
360
+ ▁GREAT 359
361
+ AIN 360
362
+ ▁THROUGH 361
363
+ ▁THING 362
364
+ OUS 363
365
+ ▁PRI 364
366
+ ▁GOT 365
367
+ ▁SHOULD 366
368
+ ▁AFTER 367
369
+ ▁HEAR 368
370
+ ▁TA 369
371
+ ▁ONLY 370
372
+ ▁CHI 371
373
+ IOUS 372
374
+ ▁SHA 373
375
+ ▁MOST 374
376
+ ▁ACTUALLY 375
377
+ ▁START 376
378
+ LIC 377
379
+ ▁VA 378
380
+ ▁RI 379
381
+ DAY 380
382
+ IAN 381
383
+ ▁DOES 382
384
+ ROW 383
385
+ ▁GRA 384
386
+ ITION 385
387
+ ▁MANY 386
388
+ ▁BEFORE 387
389
+ ▁GIVE 388
390
+ PORT 389
391
+ QUI 390
392
+ ▁LIFE 391
393
+ ▁WORLD 392
394
+ ▁PI 393
395
+ ▁LONG 394
396
+ ▁THREE 395
397
+ IZE 396
398
+ NESS 397
399
+ ▁SHOW 398
400
+ PH 399
401
+ ▁WHY 400
402
+ ▁QUESTION 401
403
+ WARD 402
404
+ ▁THANK 403
405
+ ▁PH 404
406
+ ▁DIFFERENT 405
407
+ ▁OWN 406
408
+ ▁FEEL 407
409
+ ▁MIGHT 408
410
+ ▁HAPPEN 409
411
+ ▁MADE 410
412
+ ▁BRO 411
413
+ IBLE 412
414
+ ▁HI 413
415
+ ▁STATE 414
416
+ ▁HAND 415
417
+ ▁NEVER 416
418
+ ▁PLACE 417
419
+ ▁LOVE 418
420
+ ▁DU 419
421
+ ▁POINT 420
422
+ ▁HELP 421
423
+ ▁COUNT 422
424
+ ▁STILL 423
425
+ ▁MR 424
426
+ ▁FIND 425
427
+ ▁PERSON 426
428
+ ▁CAME 427
429
+ ▁SAME 428
430
+ ▁LAST 429
431
+ ▁HIGH 430
432
+ ▁OLD 431
433
+ ▁UNDER 432
434
+ ▁FOUR 433
435
+ ▁AROUND 434
436
+ ▁SORT 435
437
+ ▁CHANGE 436
438
+ ▁YES 437
439
+ SHIP 438
440
+ ▁ANOTHER 439
441
+ ATIVE 440
442
+ ▁FOUND 441
443
+ ▁JA 442
444
+ ▁ALWAYS 443
445
+ ▁NEXT 444
446
+ ▁TURN 445
447
+ ▁JU 446
448
+ ▁SIX 447
449
+ ▁FACT 448
450
+ ▁INTEREST 449
451
+ ▁WORD 450
452
+ ▁THOUSAND 451
453
+ ▁HUNDRED 452
454
+ ▁NUMBER 453
455
+ ▁IDEA 454
456
+ ▁PLAN 455
457
+ ▁COURSE 456
458
+ ▁SCHOOL 457
459
+ ▁HOUSE 458
460
+ ▁TWENTY 459
461
+ ▁JE 460
462
+ ▁PLAY 461
463
+ ▁AWAY 462
464
+ ▁LEARN 463
465
+ ▁HARD 464
466
+ ▁WEEK 465
467
+ ▁BETTER 466
468
+ ▁WHILE 467
469
+ ▁FRIEND 468
470
+ ▁OKAY 469
471
+ ▁NINE 470
472
+ ▁UNDERSTAND 471
473
+ ▁KEEP 472
474
+ ▁GONNA 473
475
+ ▁SYSTEM 474
476
+ ▁AMERICA 475
477
+ ▁POWER 476
478
+ ▁IMPORTANT 477
479
+ ▁WITHOUT 478
480
+ ▁MAYBE 479
481
+ ▁SEVEN 480
482
+ ▁BETWEEN 481
483
+ ▁BUILD 482
484
+ ▁CERTAIN 483
485
+ ▁PROBLEM 484
486
+ ▁MONEY 485
487
+ ▁BELIEVE 486
488
+ ▁SECOND 487
489
+ ▁REASON 488
490
+ ▁TOGETHER 489
491
+ ▁PUBLIC 490
492
+ ▁ANYTHING 491
493
+ ▁SPEAK 492
494
+ ▁BUSINESS 493
495
+ ▁EVERYTHING 494
496
+ ▁CLOSE 495
497
+ ▁QUITE 496
498
+ ▁ANSWER 497
499
+ ▁ENOUGH 498
500
+ Q 499
model.py ADDED
@@ -0,0 +1,585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
2
+ #
3
+ # See LICENSE for clarification regarding multiple authors
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from huggingface_hub import hf_hub_download
18
+ from functools import lru_cache
19
+ import os
20
+
21
# HACK: the k2 wheel bundles shared libraries (*.so) that the sherpa wheel
# expects to be able to load; copy them into sherpa's lib directory so the
# `import sherpa` below can resolve its dynamic dependencies.
# NOTE(review): the path hard-codes Python 3.8 and the HF Spaces home
# directory — confirm it matches the runtime image this Space runs on.
os.system(
    "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
)

import k2
import sherpa


# Sampling rate (Hz) expected by every model configured in this module.
sample_rate = 16000
30
+
31
+
32
@lru_cache(maxsize=30)
def get_pretrained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Build (or fetch from cache) the offline recognizer for *repo_id*.

    Args:
      repo_id: Hugging Face repository id identifying the model.
      decoding_method: Decoding method name passed through to the factory.
      num_active_paths: Beam size used by beam-search decoding methods.

    Returns:
      A configured ``sherpa.OfflineRecognizer``.

    Raises:
      ValueError: If *repo_id* is not found in any language registry.
    """
    # Probe each per-language registry in turn; each one maps a repo_id
    # to the factory function that downloads and configures that model.
    registries = (
        chinese_models,
        english_models,
        chinese_english_mixed_models,
        tibetan_models,
        arabic_models,
        german_models,
    )
    for registry in registries:
        factory = registry.get(repo_id)
        if factory is not None:
            return factory(
                repo_id,
                decoding_method=decoding_method,
                num_active_paths=num_active_paths,
            )
    raise ValueError(f"Unsupported repo_id: {repo_id}")
64
+
65
+
66
def _get_nn_model_filename(
    repo_id: str,
    filename: str,
    subfolder: str = "exp",
) -> str:
    """Download a model checkpoint from the Hub and return its local path.

    The download is skipped when the file is already in the local HF cache.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
77
+
78
+
79
def _get_bpe_model_filename(
    repo_id: str,
    filename: str = "bpe.model",
    subfolder: str = "data/lang_bpe_500",
) -> str:
    """Download a BPE model file from the Hub and return its local path."""
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
90
+
91
+
92
def _get_token_filename(
    repo_id: str,
    filename: str = "tokens.txt",
    subfolder: str = "data/lang_char",
) -> str:
    """Download a token table from the Hub and return its local path."""
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
103
+
104
+
105
@lru_cache(maxsize=10)
def _get_aishell2_pretrained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Create an offline recognizer for the AISHELL-2 transducer models."""
    supported = (
        # context-size 1
        "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",  # noqa
        # context-size 2
        "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",  # noqa
    )
    assert repo_id in supported, repo_id

    model_path = _get_nn_model_filename(repo_id=repo_id, filename="cpu_jit.pt")
    tokens_path = _get_token_filename(repo_id=repo_id)

    # 80-dim fbank features at the module-wide sample rate; dithering is
    # disabled so decoding results are deterministic.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
141
+
142
+
143
@lru_cache(maxsize=10)
def _get_gigaspeech_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Create an offline recognizer for the GigaSpeech transducer model."""
    assert repo_id in (
        "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
    ), repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit-iter-3488000-avg-20.pt",
    )
    # This repo does not ship a tokens.txt in the expected subfolder, so a
    # local copy bundled with the Space is used instead.
    tokens_path = "./giga-tokens.txt"

    # 80-dim fbank features; dithering disabled for deterministic output.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
176
+
177
+
178
@lru_cache(maxsize=10)
def _get_librispeech_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Create an offline recognizer for the LibriSpeech transducer models."""
    supported = (
        "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02",  # noqa
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",  # noqa
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11",  # noqa
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14",  # noqa
    )
    assert repo_id in supported, repo_id

    # A couple of repos ship their checkpoint under a non-default filename.
    filename_overrides = {
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": "cpu_jit-torch-1.10.0.pt",  # noqa
        "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": "cpu_jit-torch-1.10.pt",  # noqa
    }
    filename = filename_overrides.get(repo_id, "cpu_jit.pt")

    model_path = _get_nn_model_filename(repo_id=repo_id, filename=filename)
    tokens_path = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")

    # 80-dim fbank features; dithering disabled for deterministic output.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
227
+
228
+
229
@lru_cache(maxsize=10)
def _get_wenetspeech_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create an offline recognizer for the WenetSpeech transducer model."""
    assert repo_id in (
        "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
    ), repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit_epoch_10_avg_2_torch_1.7.1.pt",
    )
    tokens_path = _get_token_filename(repo_id=repo_id)

    # 80-dim fbank features; dithering disabled for deterministic output.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
262
+
263
+
264
@lru_cache(maxsize=10)
def _get_chinese_english_mixed_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create an offline recognizer for the mixed Chinese+English models."""
    # Each repo keeps its checkpoint and token table in different locations.
    repo_settings = {
        "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": (
            "cpu_jit.pt",
            "data/lang_char",
        ),
        "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": (
            "cpu_jit-epoch-11-avg-1.pt",
            "data/lang_char_bpe",
        ),
    }
    assert repo_id in repo_settings, repo_id
    filename, subfolder = repo_settings[repo_id]

    model_path = _get_nn_model_filename(repo_id=repo_id, filename=filename)
    tokens_path = _get_token_filename(repo_id=repo_id, subfolder=subfolder)

    # 80-dim fbank features; dithering disabled for deterministic output.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
305
+
306
+
307
@lru_cache(maxsize=10)
def _get_alimeeting_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create an offline recognizer for the AliMeeting transducer model."""
    assert repo_id in (
        "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
    ), repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit_torch_1.7.1.pt",
    )
    tokens_path = _get_token_filename(repo_id=repo_id)

    # 80-dim fbank features; dithering disabled for deterministic output.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
340
+
341
+
342
@lru_cache(maxsize=10)
def _get_wenet_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create an offline recognizer for the WeNet Chinese/English models."""
    assert repo_id in (
        "csukuangfj/wenet-chinese-model",
        "csukuangfj/wenet-english-model",
    ), repo_id

    # WeNet repos keep everything at the repo root, not under exp/.
    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="final.zip",
        subfolder=".",
    )
    tokens_path = _get_token_filename(
        repo_id=repo_id,
        filename="units.txt",
        subfolder=".",
    )

    # WeNet models were trained on un-normalized samples, hence
    # normalize_samples=False; 80-dim fbank, no dithering.
    fbank = sherpa.FeatureConfig(normalize_samples=False)
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
381
+
382
+
383
@lru_cache(maxsize=10)
def _get_aidatatang_200zh_pretrained_mode(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create an offline recognizer for the aidatatang_200zh model.

    NOTE(review): the function name says "mode" (likely a typo for
    "model"); it is referenced by the registry below, so renaming would
    need a coordinated change.
    """
    assert repo_id in (
        "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
    ), repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit_torch.1.7.1.pt",
    )
    tokens_path = _get_token_filename(repo_id=repo_id)

    # 80-dim fbank features; dithering disabled for deterministic output.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
416
+
417
+
418
@lru_cache(maxsize=10)
def _get_tibetan_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create an offline recognizer for the Tibetan (XBMU-AMDO31) models."""
    assert repo_id in (
        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
    ), repo_id

    # The stateless5 repo ships its checkpoint under a non-default filename.
    filename_overrides = {
        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29": "cpu_jit-epoch-28-avg-23-torch-1.10.0.pt",  # noqa
    }
    filename = filename_overrides.get(repo_id, "cpu_jit.pt")

    model_path = _get_nn_model_filename(repo_id=repo_id, filename=filename)
    tokens_path = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")

    # 80-dim fbank features; dithering disabled for deterministic output.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
460
+
461
+
462
@lru_cache(maxsize=10)
def _get_arabic_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create an offline recognizer for the Arabic (MGB2) conformer model."""
    assert repo_id in (
        "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
    ), repo_id

    model_path = _get_nn_model_filename(repo_id=repo_id, filename="cpu_jit.pt")
    # This model uses a 5000-piece BPE vocabulary.
    tokens_path = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_5000")

    # 80-dim fbank features; dithering disabled for deterministic output.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = sample_rate
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
496
+
497
+
498
@lru_cache(maxsize=10)
def _get_german_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Create an offline recognizer for the German wav2vec2 model.

    Unlike the transducer models above, no fbank feature config is set —
    the torchaudio wav2vec2 model consumes raw waveforms.
    """
    assert repo_id in (
        "csukuangfj/wav2vec2.0-torchaudio",
    ), repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="voxpopuli_asr_base_10k_de.pt",
        subfolder=".",
    )
    tokens_path = _get_token_filename(
        repo_id=repo_id,
        filename="tokens-de.txt",
        subfolder=".",
    )

    recognizer_config = sherpa.OfflineRecognizerConfig(
        nn_model=model_path,
        tokens=tokens_path,
        use_gpu=False,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )
    return sherpa.OfflineRecognizer(recognizer_config)
531
+
532
+
533
# --- Model registries -------------------------------------------------------
# Each registry maps a Hugging Face repo_id to the factory function that
# downloads and configures that model; get_pretrained_model() searches them.

# Chinese-only models.
chinese_models = {
    "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model,  # noqa
    "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model,  # noqa
    "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model,  # noqa
    "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode,  # noqa
    "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model,  # noqa
    "csukuangfj/wenet-chinese-model": _get_wenet_model,
}

# English-only models.
english_models = {
    "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model,  # noqa
    "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_librispeech_pre_trained_model,  # noqa
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_librispeech_pre_trained_model,  # noqa
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_librispeech_pre_trained_model,  # noqa
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_librispeech_pre_trained_model,  # noqa
    "csukuangfj/wenet-english-model": _get_wenet_model,
}

# Code-switching (mixed Chinese + English) models.
chinese_english_mixed_models = {
    "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": _get_chinese_english_mixed_model,
    "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": _get_chinese_english_mixed_model,  # noqa
}

# Tibetan models.
tibetan_models = {
    "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02": _get_tibetan_pre_trained_model,  # noqa
    "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29": _get_tibetan_pre_trained_model,  # noqa
}

# Arabic models.
arabic_models = {
    "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06": _get_arabic_pre_trained_model,  # noqa
}

# German models.
german_models = {
    "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model,
}

# Union of all registries (repo_ids are unique across languages).
all_models = {
    **chinese_models,
    **english_models,
    **chinese_english_mixed_models,
    **tibetan_models,
    **arabic_models,
    **german_models,
}

# Display-language name -> list of repo_ids, used to populate the UI dropdown.
language_to_models = {
    "Chinese": list(chinese_models.keys()),
    "English": list(english_models.keys()),
    "Chinese+English": list(chinese_english_mixed_models.keys()),
    "Tibetan": list(tibetan_models.keys()),
    "Arabic": list(arabic_models.keys()),
    "German": list(german_models.keys()),
}
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ https://download.pytorch.org/whl/cpu/torch-1.13.0%2Bcpu-cp38-cp38-linux_x86_64.whl
2
+ https://download.pytorch.org/whl/cpu/torchaudio-0.13.0%2Bcpu-cp38-cp38-linux_x86_64.whl
3
+
4
+ https://huggingface.co/csukuangfj/wheels/resolve/main/k2-1.23.2.dev20221204%2Bcpu.torch1.13.0-cp38-cp38-linux_x86_64.whl
5
+ https://huggingface.co/csukuangfj/wheels/resolve/main/k2_sherpa-1.1-cp38-cp38-linux_x86_64.whl
6
+ https://huggingface.co/csukuangfj/wheels/resolve/main/kaldifeat-1.22-cp38-cp38-linux_x86_64.whl
7
+
8
+ sentencepiece>=0.1.96
9
+ numpy
10
+
11
+ huggingface_hub
test_wavs/aidatatang_200zh/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Files are downloaded from
2
+ https://huggingface.co/luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2/tree/main/test_wavs
test_wavs/aidatatang_200zh/T0055G0036S0002.wav ADDED
Binary file (67.6 kB). View file
 
test_wavs/aidatatang_200zh/T0055G0036S0003.wav ADDED
Binary file (94.2 kB). View file
 
test_wavs/aidatatang_200zh/T0055G0036S0004.wav ADDED
Binary file (70.5 kB). View file
 
test_wavs/aishell2/ID0012W0030.wav ADDED
Binary file (113 kB). View file
 
test_wavs/aishell2/ID0012W0162.wav ADDED
Binary file (114 kB). View file
 
test_wavs/aishell2/ID0012W0215.wav ADDED
Binary file (104 kB). View file
 
test_wavs/aishell2/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Files are downloaded from
2
+ https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12/tree/main/test_wavs
test_wavs/aishell2/trans.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ID0012W0162 立法机关采纳了第二种意见
2
+ ID0012W0215 大家都愿意牺牲自己的生命
3
+ ID0012W0030 完全是典型的军事侵略
test_wavs/arabic/a.wav ADDED
Binary file (253 kB). View file
 
test_wavs/arabic/b.wav ADDED
Binary file (243 kB). View file
 
test_wavs/arabic/c.wav ADDED
Binary file (150 kB). View file
 
test_wavs/arabic/trans.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0053813:0054281 بعد أن عجز وبدأ يصدر مشكلات شعبه ومشكلات مصر
2
+ 94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0051454:0052244 وهؤلاء أولياء الشيطان ها هو ذا أحدهم الآن ضيفا عليكم على قناة الجزيرة ولا يستحي في ذلك
3
+ 94D37D38-B203-4FC0-9F3A-538F5C174920_spk-0001_seg-0052244:0053004 عندما استغاث الليبيون بالعالم استغاثوا لرفع الظلم وليس لقهر إرادة الأمة ومصادرة الحياة الدستورية
test_wavs/german/20120315-0900-PLENARY-14-de_20120315.wav ADDED
Binary file (381 kB). View file
 
test_wavs/german/20170517-0900-PLENARY-16-de_20170517.wav ADDED
Binary file (282 kB). View file
 
test_wavs/gigaspeech/1-minute-audiobook.opus ADDED
Binary file (580 kB). View file
 
test_wavs/gigaspeech/100-seconds-podcast.opus ADDED
Binary file (955 kB). View file
 
test_wavs/gigaspeech/100-seconds-youtube.opus ADDED
Binary file (948 kB). View file
 
test_wavs/librispeech/1089-134686-0001.wav ADDED
Binary file (212 kB). View file
 
test_wavs/librispeech/1221-135766-0001.wav ADDED
Binary file (535 kB). View file
 
test_wavs/librispeech/1221-135766-0002.wav ADDED
Binary file (154 kB). View file
 
test_wavs/librispeech/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Files are downloaded from
2
+ https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs
test_wavs/librispeech/trans.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 1089-134686-0001 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
2
+ 1221-135766-0001 GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
3
+ 1221-135766-0002 YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
test_wavs/tal_csasr/0.wav ADDED
Binary file (259 kB). View file
 
test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav ADDED
Binary file (163 kB). View file
 
test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav ADDED
Binary file (150 kB). View file
 
test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav ADDED
Binary file (283 kB). View file
 
test_wavs/tal_csasr/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Files are downloaded from
2
+ https://huggingface.co/luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5/tree/main/test_wavs
test_wavs/tibetan/a_0_cacm-A70_31116.wav ADDED
Binary file (97.4 kB). View file
 
test_wavs/tibetan/a_0_cacm-A70_31117.wav ADDED
Binary file (128 kB). View file
 
test_wavs/tibetan/a_0_cacm-A70_31118.wav ADDED
Binary file (87.1 kB). View file
 
test_wavs/tibetan/trans.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ a_0_cacm-A70_31116.wav ལོ བཅུ ཙམ མ འདང བའི དུས སྐབས ནང
2
+ a_0_cacm-A70_31117.wav དྲག པོའི ངོ ལོག ཟིང འཁྲུག སྒྲིག འཛུགས དང ངན བཀོད བྱས ཡོད
3
+ a_0_cacm-A70_31118.wav གནས བབ འདིའི རིགས གང མགྱོགས འགྱུར བ གཏོང དགོས
test_wavs/wenetspeech/DEV_T0000000000.opus ADDED
Binary file (23.1 kB). View file
 
test_wavs/wenetspeech/DEV_T0000000001.opus ADDED
Binary file (21.5 kB). View file
 
test_wavs/wenetspeech/DEV_T0000000002.opus ADDED
Binary file (18.8 kB). View file
 
test_wavs/wenetspeech/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Files are downloaded from
2
+ https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2/tree/main/test_wavs