IhebJettPilot committed on
Commit
c293cf4
1 Parent(s): 49280c9

Upload 5 files

Files changed (5)
  1. app.py +385 -0
  2. decode.py +121 -0
  3. giga-tokens.txt +500 -0
  4. model.py +1001 -0
  5. requirements (1).txt +12 -0
app.py ADDED
@@ -0,0 +1,385 @@
+ #!/usr/bin/env python3
+ #
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
+ #
+ # See LICENSE for clarification regarding multiple authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # References:
+ # https://gradio.app/docs/#dropdown
+
+ import logging
+ import os
+ import tempfile
+ import time
+ import urllib.request
+ from datetime import datetime
+
+ import gradio as gr
+ import torch
+ import torchaudio
+
+ from examples import examples
+ from model import decode, get_pretrained_model, language_to_models, sample_rate
+
+ languages = list(language_to_models.keys())
+
+
+ def convert_to_wav(in_filename: str) -> str:
+     """Convert the input audio file to a 16 kHz wave file."""
+     out_filename = in_filename + ".wav"
+     logging.info(f"Converting '{in_filename}' to '{out_filename}'")
+     _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 '{out_filename}'")
+     # A FLAC copy of the resampled audio is also written alongside the wave file.
+     _ = os.system(
+         f"ffmpeg -hide_banner -loglevel error -i '{in_filename}' -ar 16000 '{out_filename}.flac'"
+     )
+
+     return out_filename
+
+
+ def build_html_output(s: str, style: str = "result_item_success"):
+     return f"""
+     <div class='result'>
+         <div class='result_item {style}'>
+           {s}
+         </div>
+     </div>
+     """
+
+
+ def process_url(
+     language: str,
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+     url: str,
+ ):
+     logging.info(f"Processing URL: {url}")
+     with tempfile.NamedTemporaryFile() as f:
+         try:
+             urllib.request.urlretrieve(url, f.name)
+
+             return process(
+                 in_filename=f.name,
+                 language=language,
+                 repo_id=repo_id,
+                 decoding_method=decoding_method,
+                 num_active_paths=num_active_paths,
+             )
+         except Exception as e:
+             logging.info(str(e))
+             return "", build_html_output(str(e), "result_item_error")
+
+
+ def process_uploaded_file(
+     language: str,
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+     in_filename: str,
+ ):
+     if in_filename is None or in_filename == "":
+         return "", build_html_output(
+             "Please first upload a file and then click "
+             'the button "submit for recognition"',
+             "result_item_error",
+         )
+
+     logging.info(f"Processing uploaded file: {in_filename}")
+     try:
+         return process(
+             in_filename=in_filename,
+             language=language,
+             repo_id=repo_id,
+             decoding_method=decoding_method,
+             num_active_paths=num_active_paths,
+         )
+     except Exception as e:
+         logging.info(str(e))
+         return "", build_html_output(str(e), "result_item_error")
+
+
+ def process_microphone(
+     language: str,
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+     in_filename: str,
+ ):
+     if in_filename is None or in_filename == "":
+         return "", build_html_output(
+             "Please first click 'Record from microphone', speak, "
+             "click 'Stop recording', and then "
+             "click the button 'submit for recognition'",
+             "result_item_error",
+         )
+
+     logging.info(f"Processing microphone: {in_filename}")
+     try:
+         return process(
+             in_filename=in_filename,
+             language=language,
+             repo_id=repo_id,
+             decoding_method=decoding_method,
+             num_active_paths=num_active_paths,
+         )
+     except Exception as e:
+         logging.info(str(e))
+         return "", build_html_output(str(e), "result_item_error")
+
+
+ @torch.no_grad()
+ def process(
+     language: str,
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+     in_filename: str,
+ ):
+     logging.info(f"language: {language}")
+     logging.info(f"repo_id: {repo_id}")
+     logging.info(f"decoding_method: {decoding_method}")
+     logging.info(f"num_active_paths: {num_active_paths}")
+     logging.info(f"in_filename: {in_filename}")
+
+     filename = convert_to_wav(in_filename)
+
+     now = datetime.now()
+     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
+     logging.info(f"Started at {date_time}")
+
+     start = time.time()
+
+     recognizer = get_pretrained_model(
+         repo_id,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     text = decode(recognizer, filename)
+
+     date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+     end = time.time()
+
+     metadata = torchaudio.info(filename)
+     duration = metadata.num_frames / sample_rate
+     rtf = (end - start) / duration
+
+     logging.info(f"Finished at {date_time}. Elapsed: {end - start: .3f} s")
+
+     info = f"""
+     Wave duration  : {duration: .3f} s <br/>
+     Processing time: {end - start: .3f} s <br/>
+     RTF: {end - start: .3f}/{duration: .3f} = {rtf:.3f} <br/>
+     """
+     if rtf > 1:
+         info += (
+             "<br/>We are loading the model for the first run. "
+             "Please run again to measure the real RTF.<br/>"
+         )
+
+     logging.info(info)
+     logging.info(f"\nrepo_id: {repo_id}\nhyp: {text}")
+
+     return text, build_html_output(info)
+
+
+ title = "# Automatic Speech Recognition with Next-gen Kaldi"
+ description = """
+ This Space shows how to do automatic speech recognition with Next-gen Kaldi.
+
+ Please visit
+ <https://huggingface.co/spaces/k2-fsa/streaming-automatic-speech-recognition>
+ for streaming speech recognition with **Next-gen Kaldi**.
+
+ It is running on CPU within a Docker container provided by Hugging Face.
+
+ See more information by visiting the following links:
+
+ - <https://github.com/k2-fsa/icefall>
+ - <https://github.com/k2-fsa/sherpa>
+ - <https://github.com/k2-fsa/k2>
+ - <https://github.com/lhotse-speech/lhotse>
+
+ If you want to deploy it locally, please see
+ <https://k2-fsa.github.io/sherpa/>
+ """
+
+ # css style is copied from
+ # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
+ css = """
+ .result {display:flex;flex-direction:column}
+ .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+ .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+ .result_item_error {background-color:#ff7070;color:white;align-self:start}
+ """
+
+
+ def update_model_dropdown(language: str):
+     if language in language_to_models:
+         choices = language_to_models[language]
+         return gr.Dropdown.update(choices=choices, value=choices[0])
+
+     raise ValueError(f"Unsupported language: {language}")
+
+
+ demo = gr.Blocks(css=css)
+
+
+ with demo:
+     gr.Markdown(title)
+     language_choices = list(language_to_models.keys())
+
+     language_radio = gr.Radio(
+         label="Language",
+         choices=language_choices,
+         value=language_choices[0],
+     )
+     model_dropdown = gr.Dropdown(
+         choices=language_to_models[language_choices[0]],
+         label="Select a model",
+         value=language_to_models[language_choices[0]][0],
+     )
+
+     language_radio.change(
+         update_model_dropdown,
+         inputs=language_radio,
+         outputs=model_dropdown,
+     )
+
+     decoding_method_radio = gr.Radio(
+         label="Decoding method",
+         choices=["greedy_search", "modified_beam_search"],
+         value="greedy_search",
+     )
+
+     num_active_paths_slider = gr.Slider(
+         minimum=1,
+         value=4,
+         step=1,
+         label="Number of active paths for modified_beam_search",
+     )
+
+     with gr.Tabs():
+         with gr.TabItem("Upload from disk"):
+             uploaded_file = gr.Audio(
+                 source="upload",  # Choose between "microphone", "upload"
+                 type="filepath",
+                 optional=False,
+                 label="Upload from disk",
+             )
+             upload_button = gr.Button("Submit for recognition")
+             uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
+             uploaded_html_info = gr.HTML(label="Info")
+
+             gr.Examples(
+                 examples=examples,
+                 inputs=[
+                     language_radio,
+                     model_dropdown,
+                     decoding_method_radio,
+                     num_active_paths_slider,
+                     uploaded_file,
+                 ],
+                 outputs=[uploaded_output, uploaded_html_info],
+                 fn=process_uploaded_file,
+             )
+
+         with gr.TabItem("Record from microphone"):
+             microphone = gr.Audio(
+                 source="microphone",  # Choose between "microphone", "upload"
+                 type="filepath",
+                 optional=False,
+                 label="Record from microphone",
+             )
+
+             record_button = gr.Button("Submit for recognition")
+             recorded_output = gr.Textbox(label="Recognized speech from recordings")
+             recorded_html_info = gr.HTML(label="Info")
+
+             gr.Examples(
+                 examples=examples,
+                 inputs=[
+                     language_radio,
+                     model_dropdown,
+                     decoding_method_radio,
+                     num_active_paths_slider,
+                     microphone,
+                 ],
+                 outputs=[recorded_output, recorded_html_info],
+                 fn=process_microphone,
+             )
+
+         with gr.TabItem("From URL"):
+             url_textbox = gr.Textbox(
+                 max_lines=1,
+                 placeholder="URL to an audio file",
+                 label="URL",
+                 interactive=True,
+             )
+
+             url_button = gr.Button("Submit for recognition")
+             url_output = gr.Textbox(label="Recognized speech from URL")
+             url_html_info = gr.HTML(label="Info")
+
+     upload_button.click(
+         process_uploaded_file,
+         inputs=[
+             language_radio,
+             model_dropdown,
+             decoding_method_radio,
+             num_active_paths_slider,
+             uploaded_file,
+         ],
+         outputs=[uploaded_output, uploaded_html_info],
+     )
+
+     record_button.click(
+         process_microphone,
+         inputs=[
+             language_radio,
+             model_dropdown,
+             decoding_method_radio,
+             num_active_paths_slider,
+             microphone,
+         ],
+         outputs=[recorded_output, recorded_html_info],
+     )
+
+     url_button.click(
+         process_url,
+         inputs=[
+             language_radio,
+             model_dropdown,
+             decoding_method_radio,
+             num_active_paths_slider,
+             url_textbox,
+         ],
+         outputs=[url_output, url_html_info],
+     )
+
+     gr.Markdown(description)
+
+
+ torch.set_num_threads(1)
+ torch.set_num_interop_threads(1)
+
+ # Disable TorchScript profiling and graph-executor optimization.
+ torch._C._jit_set_profiling_executor(False)
+ torch._C._jit_set_profiling_mode(False)
+ torch._C._set_graph_executor_optimize(False)
+
+ if __name__ == "__main__":
+     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+     logging.basicConfig(format=formatter, level=logging.INFO)
+
+     demo.launch()
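
Aside: the RTF that process() reports is simply processing time divided by audio duration, so values below 1 mean the recognizer runs faster than real time. A minimal sketch of the same measurement, where recognize is a hypothetical stand-in for any recognition callable (e.g. lambda p: decode(recognizer, p)):

    import time

    import torchaudio


    def measure_rtf(recognize, wav_path: str, sample_rate: int = 16000) -> float:
        # RTF = processing time / audio duration.
        start = time.time()
        recognize(wav_path)  # hypothetical recognition callable
        elapsed = time.time() - start
        duration = torchaudio.info(wav_path).num_frames / sample_rate
        return elapsed / duration
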
decode.py ADDED
@@ -0,0 +1,121 @@
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
+ #
+ # Copied from https://github.com/k2-fsa/sherpa/blob/master/sherpa/bin/conformer_rnnt/decode.py
+ #
+ # See LICENSE for clarification regarding multiple authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from typing import List
+
+ import torch
+ from sherpa import RnntConformerModel, greedy_search, modified_beam_search
+ from torch.nn.utils.rnn import pad_sequence
+
+ LOG_EPS = math.log(1e-10)
+
+
+ @torch.no_grad()
+ def run_model_and_do_greedy_search(
+     model: RnntConformerModel,
+     features: List[torch.Tensor],
+ ) -> List[List[int]]:
+     """Run the RNN-T model on the given features and use greedy search
+     to decode the output of the model.
+
+     Args:
+       model:
+         The RNN-T model.
+       features:
+         A list of 2-D tensors. Each entry is of shape
+         (num_frames, feature_dim).
+     Returns:
+       Return a list-of-list containing the decoded token IDs.
+     """
+     features_length = torch.tensor(
+         [f.size(0) for f in features],
+         dtype=torch.int64,
+     )
+     features = pad_sequence(
+         features,
+         batch_first=True,
+         padding_value=LOG_EPS,
+     )
+
+     device = model.device
+     features = features.to(device)
+     features_length = features_length.to(device)
+
+     encoder_out, encoder_out_length = model.encoder(
+         features=features,
+         features_length=features_length,
+     )
+
+     hyp_tokens = greedy_search(
+         model=model,
+         encoder_out=encoder_out,
+         encoder_out_length=encoder_out_length.cpu(),
+     )
+     return hyp_tokens
+
+
+ @torch.no_grad()
+ def run_model_and_do_modified_beam_search(
+     model: RnntConformerModel,
+     features: List[torch.Tensor],
+     num_active_paths: int,
+ ) -> List[List[int]]:
+     """Run the RNN-T model on the given features and use modified beam search
+     to decode the output of the model.
+
+     Args:
+       model:
+         The RNN-T model.
+       features:
+         A list of 2-D tensors. Each entry is of shape
+         (num_frames, feature_dim).
+       num_active_paths:
+         It specifies the number of active paths for each utterance. Due to
+         merging paths with identical token sequences, the actual number
+         may be less than "num_active_paths".
+     Returns:
+       Return a list-of-list containing the decoded token IDs.
+     """
+     features_length = torch.tensor(
+         [f.size(0) for f in features],
+         dtype=torch.int64,
+     )
+     features = pad_sequence(
+         features,
+         batch_first=True,
+         padding_value=LOG_EPS,
+     )
+
+     device = model.device
+     features = features.to(device)
+     features_length = features_length.to(device)
+
+     encoder_out, encoder_out_length = model.encoder(
+         features=features,
+         features_length=features_length,
+     )
+
+     hyp_tokens = modified_beam_search(
+         model=model,
+         encoder_out=encoder_out,
+         encoder_out_length=encoder_out_length.cpu(),
+         num_active_paths=num_active_paths,
+     )
+     return hyp_tokens
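
Both helpers above share the same batching step: record each utterance's frame count, then pad all feature matrices to a common length with LOG_EPS so that padded frames look like silence in log-filterbank space. A standalone sketch of just that step (the feature shapes are made up for illustration):

    import math

    import torch
    from torch.nn.utils.rnn import pad_sequence

    LOG_EPS = math.log(1e-10)

    # Two hypothetical utterances with 50 and 80 frames of 80-dim fbank features.
    features = [torch.randn(50, 80), torch.randn(80, 80)]

    features_length = torch.tensor([f.size(0) for f in features], dtype=torch.int64)
    batch = pad_sequence(features, batch_first=True, padding_value=LOG_EPS)

    print(batch.shape)      # torch.Size([2, 80, 80])
    print(features_length)  # tensor([50, 80])
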
giga-tokens.txt ADDED
@@ -0,0 +1,500 @@
+ <blk> 0
+ <sos/eos> 1
+ <unk> 2
+ S 3
+ T 4
+ ▁THE 5
+ ▁A 6
+ E 7
+ ▁AND 8
+ ▁TO 9
+ N 10
+ D 11
+ ▁OF 12
+ ' 13
+ ING 14
+ ▁I 15
+ Y 16
+ ▁IN 17
+ ED 18
+ ▁THAT 19
+ ▁ 20
+ P 21
+ R 22
+ ▁YOU 23
+ M 24
+ RE 25
+ ER 26
+ C 27
+ O 28
+ ▁IT 29
+ L 30
+ A 31
+ U 32
+ G 33
+ ▁WE 34
+ ▁IS 35
+ ▁SO 36
+ AL 37
+ I 38
+ ▁S 39
+ ▁RE 40
+ AR 41
+ B 42
+ ▁FOR 43
+ ▁C 44
+ ▁BE 45
+ LE 46
+ F 47
+ W 48
+ ▁E 49
+ ▁HE 50
+ LL 51
+ ▁WAS 52
+ LY 53
+ OR 54
+ IN 55
+ ▁F 56
+ VE 57
+ ▁THIS 58
+ TH 59
+ K 60
+ ▁ON 61
+ IT 62
+ ▁B 63
+ ▁WITH 64
+ ▁BUT 65
+ EN 66
+ CE 67
+ RI 68
+ ▁DO 69
+ UR 70
+ ▁HAVE 71
+ ▁DE 72
+ ▁ME 73
+ ▁T 74
+ ENT 75
+ CH 76
+ ▁THEY 77
+ ▁NOT 78
+ ES 79
+ V 80
+ ▁AS 81
+ RA 82
+ ▁P 83
+ ON 84
+ TER 85
+ ▁ARE 86
+ ▁WHAT 87
+ IC 88
+ ▁ST 89
+ ▁LIKE 90
+ ATION 91
+ ▁OR 92
+ ▁CA 93
+ ▁AT 94
+ H 95
+ ▁KNOW 96
+ ▁G 97
+ AN 98
+ ▁CON 99
+ IL 100
+ ND 101
+ RO 102
+ ▁HIS 103
+ ▁CAN 104
+ ▁ALL 105
+ TE 106
+ ▁THERE 107
+ ▁SU 108
+ ▁MO 109
+ ▁MA 110
+ LI 111
+ ▁ONE 112
+ ▁ABOUT 113
+ LA 114
+ ▁CO 115
+ - 116
+ ▁MY 117
+ ▁HAD 118
+ CK 119
+ NG 120
+ ▁NO 121
+ MENT 122
+ AD 123
+ LO 124
+ ME 125
+ ▁AN 126
+ ▁FROM 127
+ NE 128
+ ▁IF 129
+ VER 130
+ ▁JUST 131
+ ▁PRO 132
+ ION 133
+ ▁PA 134
+ ▁WHO 135
+ ▁SE 136
+ EL 137
+ IR 138
+ ▁US 139
+ ▁UP 140
+ ▁YOUR 141
+ CI 142
+ RY 143
+ ▁GO 144
+ ▁SHE 145
+ ▁LE 146
+ ▁OUT 147
+ ▁PO 148
+ ▁HO 149
+ ATE 150
+ ▁BO 151
+ ▁BY 152
+ ▁FA 153
+ ▁MI 154
+ AS 155
+ MP 156
+ ▁HER 157
+ VI 158
+ ▁THINK 159
+ ▁SOME 160
+ ▁WHEN 161
+ ▁AH 162
+ ▁PEOPLE 163
+ IG 164
+ ▁WA 165
+ ▁TE 166
+ ▁LA 167
+ ▁WERE 168
+ ▁LI 169
+ ▁WOULD 170
+ ▁SEE 171
+ ▁WHICH 172
+ DE 173
+ GE 174
+ ▁K 175
+ IGHT 176
+ ▁HA 177
+ ▁OUR 178
+ UN 179
+ ▁HOW 180
+ ▁GET 181
+ IS 182
+ UT 183
+ Z 184
+ CO 185
+ ET 186
+ UL 187
+ IES 188
+ IVE 189
+ AT 190
+ ▁O 191
+ ▁DON 192
+ LU 193
+ ▁TIME 194
+ ▁WILL 195
+ ▁MORE 196
+ ▁SP 197
+ ▁NOW 198
+ RU 199
+ ▁THEIR 200
+ ▁UN 201
+ ITY 202
+ OL 203
+ X 204
+ TI 205
+ US 206
+ ▁VERY 207
+ TION 208
+ ▁FI 209
+ ▁SAY 210
+ ▁BECAUSE 211
+ ▁EX 212
+ ▁RO 213
+ ERS 214
+ IST 215
+ ▁DA 216
+ TING 217
+ ▁EN 218
+ OM 219
+ ▁BA 220
+ ▁BEEN 221
+ ▁LO 222
+ ▁UM 223
+ AGE 224
+ ABLE 225
+ ▁WO 226
+ ▁RA 227
+ ▁OTHER 228
+ ▁REALLY 229
+ ENCE 230
+ ▁GOING 231
+ ▁HIM 232
+ ▁HAS 233
+ ▁THEM 234
+ ▁DIS 235
+ ▁WANT 236
+ ID 237
+ TA 238
+ ▁LOOK 239
+ KE 240
+ ▁DID 241
+ ▁SA 242
+ ▁VI 243
+ ▁SAID 244
+ ▁RIGHT 245
+ ▁THESE 246
+ ▁WORK 247
+ ▁COM 248
+ ALLY 249
+ FF 250
+ QU 251
+ AC 252
+ ▁DR 253
+ ▁WAY 254
+ ▁INTO 255
+ MO 256
+ TED 257
+ EST 258
+ ▁HERE 259
+ OK 260
+ ▁COULD 261
+ ▁WELL 262
+ MA 263
+ ▁PRE 264
+ ▁DI 265
+ MAN 266
+ ▁COMP 267
+ ▁THEN 268
+ IM 269
+ ▁PER 270
+ ▁NA 271
+ ▁WHERE 272
+ ▁TWO 273
+ ▁WI 274
+ ▁FE 275
+ INE 276
+ ▁ANY 277
+ TURE 278
+ ▁OVER 279
+ BO 280
+ ACH 281
+ OW 282
+ ▁MAKE 283
+ ▁TRA 284
+ HE 285
+ UND 286
+ ▁EVEN 287
+ ANCE 288
+ ▁YEAR 289
+ HO 290
+ AM 291
+ ▁CHA 292
+ ▁BACK 293
+ VO 294
+ ANT 295
+ DI 296
+ ▁ALSO 297
+ ▁THOSE 298
+ ▁MAN 299
+ CTION 300
+ ICAL 301
+ ▁JO 302
+ ▁OP 303
+ ▁NEW 304
+ ▁MU 305
+ ▁HU 306
+ ▁KIND 307
+ ▁NE 308
+ CA 309
+ END 310
+ TIC 311
+ FUL 312
+ ▁YEAH 313
+ SH 314
+ ▁APP 315
+ ▁THINGS 316
+ SIDE 317
+ ▁GOOD 318
+ ONE 319
+ ▁TAKE 320
+ CU 321
+ ▁EVERY 322
+ ▁MEAN 323
+ ▁FIRST 324
+ OP 325
+ ▁TH 326
+ ▁MUCH 327
+ ▁PART 328
+ UGH 329
+ ▁COME 330
+ J 331
+ ▁THAN 332
+ ▁EXP 333
+ ▁AGAIN 334
+ ▁LITTLE 335
+ MB 336
+ ▁NEED 337
+ ▁TALK 338
+ IF 339
+ FOR 340
+ ▁SH 341
+ ISH 342
+ ▁STA 343
+ ATED 344
+ ▁GU 345
+ ▁LET 346
+ IA 347
+ ▁MAR 348
+ ▁DOWN 349
+ ▁DAY 350
+ ▁GA 351
+ ▁SOMETHING 352
+ ▁BU 353
+ DUC 354
+ HA 355
+ ▁LOT 356
+ ▁RU 357
+ ▁THOUGH 358
+ ▁GREAT 359
+ AIN 360
+ ▁THROUGH 361
+ ▁THING 362
+ OUS 363
+ ▁PRI 364
+ ▁GOT 365
+ ▁SHOULD 366
+ ▁AFTER 367
+ ▁HEAR 368
+ ▁TA 369
+ ▁ONLY 370
+ ▁CHI 371
+ IOUS 372
+ ▁SHA 373
+ ▁MOST 374
+ ▁ACTUALLY 375
+ ▁START 376
+ LIC 377
+ ▁VA 378
+ ▁RI 379
+ DAY 380
+ IAN 381
+ ▁DOES 382
+ ROW 383
+ ▁GRA 384
+ ITION 385
+ ▁MANY 386
+ ▁BEFORE 387
+ ▁GIVE 388
+ PORT 389
+ QUI 390
+ ▁LIFE 391
+ ▁WORLD 392
+ ▁PI 393
+ ▁LONG 394
+ ▁THREE 395
+ IZE 396
+ NESS 397
+ ▁SHOW 398
+ PH 399
+ ▁WHY 400
+ ▁QUESTION 401
+ WARD 402
+ ▁THANK 403
+ ▁PH 404
+ ▁DIFFERENT 405
+ ▁OWN 406
+ ▁FEEL 407
+ ▁MIGHT 408
+ ▁HAPPEN 409
+ ▁MADE 410
+ ▁BRO 411
+ IBLE 412
+ ▁HI 413
+ ▁STATE 414
+ ▁HAND 415
+ ▁NEVER 416
+ ▁PLACE 417
+ ▁LOVE 418
+ ▁DU 419
+ ▁POINT 420
+ ▁HELP 421
+ ▁COUNT 422
+ ▁STILL 423
+ ▁MR 424
+ ▁FIND 425
+ ▁PERSON 426
+ ▁CAME 427
+ ▁SAME 428
+ ▁LAST 429
+ ▁HIGH 430
+ ▁OLD 431
+ ▁UNDER 432
+ ▁FOUR 433
+ ▁AROUND 434
+ ▁SORT 435
+ ▁CHANGE 436
+ ▁YES 437
+ SHIP 438
+ ▁ANOTHER 439
+ ATIVE 440
+ ▁FOUND 441
+ ▁JA 442
+ ▁ALWAYS 443
+ ▁NEXT 444
+ ▁TURN 445
+ ▁JU 446
+ ▁SIX 447
+ ▁FACT 448
+ ▁INTEREST 449
+ ▁WORD 450
+ ▁THOUSAND 451
+ ▁HUNDRED 452
+ ▁NUMBER 453
+ ▁IDEA 454
+ ▁PLAN 455
+ ▁COURSE 456
+ ▁SCHOOL 457
+ ▁HOUSE 458
+ ▁TWENTY 459
+ ▁JE 460
+ ▁PLAY 461
+ ▁AWAY 462
+ ▁LEARN 463
+ ▁HARD 464
+ ▁WEEK 465
+ ▁BETTER 466
+ ▁WHILE 467
+ ▁FRIEND 468
+ ▁OKAY 469
+ ▁NINE 470
+ ▁UNDERSTAND 471
+ ▁KEEP 472
+ ▁GONNA 473
+ ▁SYSTEM 474
+ ▁AMERICA 475
+ ▁POWER 476
+ ▁IMPORTANT 477
+ ▁WITHOUT 478
+ ▁MAYBE 479
+ ▁SEVEN 480
+ ▁BETWEEN 481
+ ▁BUILD 482
+ ▁CERTAIN 483
+ ▁PROBLEM 484
+ ▁MONEY 485
+ ▁BELIEVE 486
+ ▁SECOND 487
+ ▁REASON 488
+ ▁TOGETHER 489
+ ▁PUBLIC 490
+ ▁ANYTHING 491
+ ▁SPEAK 492
+ ▁BUSINESS 493
+ ▁EVERYTHING 494
+ ▁CLOSE 495
+ ▁QUITE 496
+ ▁ANSWER 497
+ ▁ENOUGH 498
+ Q 499
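
giga-tokens.txt is a plain symbol table for the GigaSpeech BPE vocabulary: one token per line followed by its integer ID, with ▁ marking a word boundary. Assuming only that two-column format, a small sketch of how such a table can be loaded:

    def load_symbol_table(filename: str) -> dict:
        """Map integer token IDs to token strings, e.g. 5 -> '▁THE'."""
        id2sym = {}
        with open(filename, encoding="utf-8") as f:
            for line in f:
                sym, idx = line.rsplit(maxsplit=1)
                id2sym[int(idx)] = sym
        return id2sym


    id2sym = load_symbol_table("giga-tokens.txt")
    print(id2sym[5])  # ▁THE
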
model.py ADDED
@@ -0,0 +1,1001 @@
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
+ #
+ # See LICENSE for clarification regarding multiple authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ from functools import lru_cache
+ from typing import Tuple, Union
+
+ import torch
+ import torchaudio
+ from huggingface_hub import hf_hub_download
+
+ # Make k2's shared libraries visible to sherpa inside the Hugging Face
+ # Space container before importing either package.
+ os.system(
+     "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
+ )
+
+ import k2  # noqa
+ import sherpa
+ import sherpa_onnx
+ import numpy as np
+ import wave
+
+ sample_rate = 16000
+
+
+ def read_wave(wave_filename: str) -> Tuple[np.ndarray, int]:
+     """
+     Args:
+       wave_filename:
+         Path to a wave file. It should be single channel and each sample should
+         be 16-bit. Its sample rate does not need to be 16 kHz.
+     Returns:
+       Return a tuple containing:
+        - A 1-D array of dtype np.float32 containing the samples, which are
+          normalized to the range [-1, 1].
+        - The sample rate of the wave file.
+     """
+
+     with wave.open(wave_filename) as f:
+         assert f.getnchannels() == 1, f.getnchannels()
+         assert f.getsampwidth() == 2, f.getsampwidth()  # it is in bytes
+         num_samples = f.getnframes()
+         samples = f.readframes(num_samples)
+         samples_int16 = np.frombuffer(samples, dtype=np.int16)
+         samples_float32 = samples_int16.astype(np.float32)
+
+         samples_float32 = samples_float32 / 32768
+         return samples_float32, f.getframerate()
+
+
+ def decode_offline_recognizer(
+     recognizer: sherpa.OfflineRecognizer,
+     filename: str,
+ ) -> str:
+     s = recognizer.create_stream()
+
+     s.accept_wave_file(filename)
+     recognizer.decode_stream(s)
+
+     text = s.result.text.strip()
+     return text.lower()
+
+
+ def decode_online_recognizer(
+     recognizer: sherpa.OnlineRecognizer,
+     filename: str,
+ ) -> str:
+     samples, actual_sample_rate = torchaudio.load(filename)
+     assert sample_rate == actual_sample_rate, (
+         sample_rate,
+         actual_sample_rate,
+     )
+     samples = samples[0].contiguous()
+
+     s = recognizer.create_stream()
+
+     tail_padding = torch.zeros(int(sample_rate * 0.3), dtype=torch.float32)
+     s.accept_waveform(sample_rate, samples)
+     s.accept_waveform(sample_rate, tail_padding)
+     s.input_finished()
+
+     while recognizer.is_ready(s):
+         recognizer.decode_stream(s)
+
+     text = recognizer.get_result(s).text
+     return text.strip().lower()
+
+
+ def decode_offline_recognizer_sherpa_onnx(
+     recognizer: sherpa_onnx.OfflineRecognizer,
+     filename: str,
+ ) -> str:
+     s = recognizer.create_stream()
+     samples, sample_rate = read_wave(filename)
+     s.accept_waveform(sample_rate, samples)
+     recognizer.decode_stream(s)
+
+     return s.result.text.lower()
+
+
+ def decode_online_recognizer_sherpa_onnx(
+     recognizer: sherpa_onnx.OnlineRecognizer,
+     filename: str,
+ ) -> str:
+     s = recognizer.create_stream()
+     samples, sample_rate = read_wave(filename)
+     s.accept_waveform(sample_rate, samples)
+
+     tail_paddings = np.zeros(int(0.3 * sample_rate), dtype=np.float32)
+     s.accept_waveform(sample_rate, tail_paddings)
+     s.input_finished()
+
+     while recognizer.is_ready(s):
+         recognizer.decode_stream(s)
+
+     return recognizer.get_result(s).lower()
+
+
+ def decode(
+     recognizer: Union[
+         sherpa.OfflineRecognizer,
+         sherpa.OnlineRecognizer,
+         sherpa_onnx.OfflineRecognizer,
+         sherpa_onnx.OnlineRecognizer,
+     ],
+     filename: str,
+ ) -> str:
+     if isinstance(recognizer, sherpa.OfflineRecognizer):
+         return decode_offline_recognizer(recognizer, filename)
+     elif isinstance(recognizer, sherpa.OnlineRecognizer):
+         return decode_online_recognizer(recognizer, filename)
+     elif isinstance(recognizer, sherpa_onnx.OfflineRecognizer):
+         return decode_offline_recognizer_sherpa_onnx(recognizer, filename)
+     elif isinstance(recognizer, sherpa_onnx.OnlineRecognizer):
+         return decode_online_recognizer_sherpa_onnx(recognizer, filename)
+     else:
+         raise ValueError(f"Unknown recognizer type {type(recognizer)}")
+
+
+ @lru_cache(maxsize=30)
+ def get_pretrained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> Union[sherpa.OfflineRecognizer, sherpa.OnlineRecognizer]:
+     if repo_id in chinese_models:
+         return chinese_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in english_models:
+         return english_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in chinese_english_mixed_models:
+         return chinese_english_mixed_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in tibetan_models:
+         return tibetan_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in arabic_models:
+         return arabic_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in german_models:
+         return german_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in french_models:
+         return french_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in japanese_models:
+         return japanese_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     elif repo_id in russian_models:
+         return russian_models[repo_id](
+             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+         )
+     else:
+         raise ValueError(f"Unsupported repo_id: {repo_id}")
+
+
+ def _get_nn_model_filename(
+     repo_id: str,
+     filename: str,
+     subfolder: str = "exp",
+ ) -> str:
+     nn_model_filename = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         subfolder=subfolder,
+     )
+     return nn_model_filename
+
+
+ def _get_bpe_model_filename(
+     repo_id: str,
+     filename: str = "bpe.model",
+     subfolder: str = "data/lang_bpe_500",
+ ) -> str:
+     bpe_model_filename = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         subfolder=subfolder,
+     )
+     return bpe_model_filename
+
+
+ def _get_token_filename(
+     repo_id: str,
+     filename: str = "tokens.txt",
+     subfolder: str = "data/lang_char",
+ ) -> str:
+     token_filename = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         subfolder=subfolder,
+     )
+     return token_filename
+
+
+ @lru_cache(maxsize=10)
+ def _get_aishell2_pretrained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa.OfflineRecognizer:
+     assert repo_id in [
+         # context-size 1
+         "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",  # noqa
+         # context-size 2
+         "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",  # noqa
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit.pt",
+     )
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_russian_pre_trained_model(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in (
+         "alphacep/vosk-model-ru",
+         "alphacep/vosk-model-small-ru",
+     ), repo_id
+
+     if repo_id == "alphacep/vosk-model-ru":
+         model_dir = "am-onnx"
+     elif repo_id == "alphacep/vosk-model-small-ru":
+         model_dir = "am"
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder.onnx",
+         subfolder=model_dir,
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder.onnx",
+         subfolder=model_dir,
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner.onnx",
+         subfolder=model_dir,
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="lang")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_whisper_model(
+     repo_id: str, decoding_method: str, num_active_paths: int
+ ) -> sherpa_onnx.OfflineRecognizer:
+     name = repo_id.split("-")[1]
+     assert name in ("tiny.en", "base.en", "small.en", "medium.en"), repo_id
+     full_repo_id = "csukuangfj/sherpa-onnx-whisper-" + name
+     encoder = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename=f"{name}-encoder.int8.ort",
+         subfolder=".",
+     )
+
+     decoder = _get_nn_model_filename(
+         repo_id=full_repo_id,
+         filename=f"{name}-decoder.int8.ort",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(
+         repo_id=full_repo_id, subfolder=".", filename=f"{name}-tokens.txt"
+     )
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
+         encoder=encoder,
+         decoder=decoder,
+         tokens=tokens,
+         num_threads=2,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_gigaspeech_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa.OfflineRecognizer:
+     assert repo_id in [
+         "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit-iter-3488000-avg-20.pt",
+     )
+     tokens = "./giga-tokens.txt"
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_english_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa.OfflineRecognizer:
+     assert repo_id in [
+         "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02",  # noqa
+         "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04",  # noqa
+         "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19",  # noqa
+         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",  # noqa
+         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11",  # noqa
+         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14",  # noqa
+         "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16",  # noqa
+         "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15",  # noqa
+         "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16",  # noqa
+         "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+         "pkufool/icefall_asr_librispeech_conformer_ctc",
+         "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21",
+     ], repo_id
+
+     filename = "cpu_jit.pt"
+     if (
+         repo_id
+         == "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11"
+     ):
+         filename = "cpu_jit-torch-1.10.0.pt"
+
+     if (
+         repo_id
+         == "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02"
+     ):
+         filename = "cpu_jit-torch-1.10.pt"
+
+     if (
+         repo_id
+         == "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04"
+     ):
+         filename = "cpu_jit-epoch-30-avg-4.pt"
+
+     if (
+         repo_id
+         == "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19"
+     ):
+         filename = "cpu_jit-epoch-20-avg-5.pt"
+
+     if repo_id in (
+         "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16",
+         "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15",
+         "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16",
+     ):
+         filename = "jit_script.pt"
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=filename,
+     )
+     subfolder = "data/lang_bpe_500"
+
+     if repo_id in (
+         "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+         "pkufool/icefall_asr_librispeech_conformer_ctc",
+     ):
+         subfolder = "data/lang_bpe"
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_wenetspeech_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit_epoch_10_avg_2_torch_1.7.1.pt",
+     )
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_chinese_english_mixed_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
+         "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh",
+     ], repo_id
+
+     if repo_id == "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5":
+         filename = "cpu_jit.pt"
+         subfolder = "data/lang_char"
+     elif repo_id == "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh":
+         filename = "cpu_jit-epoch-11-avg-1.pt"
+         subfolder = "data/lang_char_bpe"
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=filename,
+     )
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_alimeeting_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
+         "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
+     ], repo_id
+
+     if repo_id == "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7":
+         filename = "cpu_jit.pt"
+     elif repo_id == "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2":
+         filename = "cpu_jit_torch_1.7.1.pt"
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=filename,
+     )
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_wenet_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "csukuangfj/wenet-chinese-model",
+         "csukuangfj/wenet-english-model",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="final.zip",
+         subfolder=".",
+     )
+     tokens = _get_token_filename(
+         repo_id=repo_id,
+         filename="units.txt",
+         subfolder=".",
+     )
+
+     feat_config = sherpa.FeatureConfig(normalize_samples=False)
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_aidatatang_200zh_pretrained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit_torch.1.7.1.pt",
+     )
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_tibetan_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
+         "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
+     ], repo_id
+
+     filename = "cpu_jit.pt"
+     if (
+         repo_id
+         == "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29"
+     ):
+         filename = "cpu_jit-epoch-28-avg-23-torch-1.10.0.pt"
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename=filename,
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_arabic_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="cpu_jit.pt",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_5000")
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_german_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "csukuangfj/wav2vec2.0-torchaudio",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="voxpopuli_asr_base_10k_de.pt",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(
+         repo_id=repo_id,
+         filename="tokens-de.txt",
+         subfolder=".",
+     )
+
+     config = sherpa.OfflineRecognizerConfig(
+         nn_model=nn_model,
+         tokens=tokens,
+         use_gpu=False,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+     )
+
+     recognizer = sherpa.OfflineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_french_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ):
+     assert repo_id in [
+         "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14",
+     ], repo_id
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="encoder-epoch-29-avg-9-with-averaged-model.onnx",
+         subfolder=".",
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="decoder-epoch-29-avg-9-with-averaged-model.onnx",
+         subfolder=".",
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="joiner-epoch-29-avg-9-with-averaged-model.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
+         tokens=tokens,
+         encoder=encoder_model,
+         decoder=decoder_model,
+         joiner=joiner_model,
+         num_threads=2,
+         sample_rate=16000,
+         feature_dim=80,
+         decoding_method=decoding_method,
+         max_active_paths=num_active_paths,
+     )
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_japanese_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa.OnlineRecognizer:
+     repo_id, kind = repo_id.rsplit("-", maxsplit=1)
+
+     assert repo_id in [
+         "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208"
+     ], repo_id
+     assert kind in ("fluent", "disfluent"), kind
+
+     encoder_model = _get_nn_model_filename(
+         repo_id=repo_id, filename="encoder_jit_trace.pt", subfolder=f"exp_{kind}"
+     )
+
+     decoder_model = _get_nn_model_filename(
+         repo_id=repo_id, filename="decoder_jit_trace.pt", subfolder=f"exp_{kind}"
+     )
+
+     joiner_model = _get_nn_model_filename(
+         repo_id=repo_id, filename="joiner_jit_trace.pt", subfolder=f"exp_{kind}"
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id)
+
+     feat_config = sherpa.FeatureConfig()
+     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+     feat_config.fbank_opts.mel_opts.num_bins = 80
+     feat_config.fbank_opts.frame_opts.dither = 0
+
+     config = sherpa.OnlineRecognizerConfig(
+         nn_model="",
+         encoder_model=encoder_model,
+         decoder_model=decoder_model,
+         joiner_model=joiner_model,
+         tokens=tokens,
+         use_gpu=False,
+         feat_config=feat_config,
+         decoding_method=decoding_method,
+         num_active_paths=num_active_paths,
+         chunk_size=32,
+     )
+
+     recognizer = sherpa.OnlineRecognizer(config)
+
+     return recognizer
+
+
+ @lru_cache(maxsize=10)
+ def _get_paraformer_zh_pre_trained_model(
+     repo_id: str,
+     decoding_method: str,
+     num_active_paths: int,
+ ) -> sherpa_onnx.OfflineRecognizer:
+     assert repo_id in [
+         "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28",
+     ], repo_id
+
+     nn_model = _get_nn_model_filename(
+         repo_id=repo_id,
+         filename="model.onnx",
+         subfolder=".",
+     )
+
+     tokens = _get_token_filename(repo_id=repo_id, subfolder=".")
+
+     recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
+         paraformer=nn_model,
+         tokens=tokens,
+         num_threads=2,
+         sample_rate=sample_rate,
+         feature_dim=80,
+         decoding_method="greedy_search",
+         debug=False,
+     )
+
+     return recognizer
+
+
+ chinese_models = {
+     "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28": _get_paraformer_zh_pre_trained_model,
+     "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model,  # noqa
+     "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7": _get_alimeeting_pre_trained_model,
+     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model,  # noqa
+     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model,  # noqa
+     "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_model,  # noqa
+     "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model,  # noqa
+     "csukuangfj/wenet-chinese-model": _get_wenet_model,
+     # "csukuangfj/icefall-asr-wenetspeech-lstm-transducer-stateless-2022-10-14": _get_lstm_transducer_model,
+ }
+
+ english_models = {
+     "whisper-tiny.en": _get_whisper_model,
+     "whisper-base.en": _get_whisper_model,
+     "whisper-small.en": _get_whisper_model,
+     # "whisper-medium.en": _get_whisper_model,
+     "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model,  # noqa
+     "yfyeung/icefall-asr-multidataset-pruned_transducer_stateless7-2023-05-04": _get_english_model,  # noqa
+     "yfyeung/icefall-asr-finetune-mux-pruned_transducer_stateless7-2023-05-19": _get_english_model,  # noqa
+     "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_english_model,  # noqa
+     "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_english_model,  # noqa
+     "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_english_model,  # noqa
+     "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_english_model,  # noqa
+     "Zengwei/icefall-asr-librispeech-zipformer-large-2023-05-16": _get_english_model,  # noqa
+     "Zengwei/icefall-asr-librispeech-zipformer-2023-05-15": _get_english_model,  # noqa
+     "Zengwei/icefall-asr-librispeech-zipformer-small-2023-05-16": _get_english_model,  # noqa
+     "videodanchik/icefall-asr-tedlium3-conformer-ctc2": _get_english_model,
+     "pkufool/icefall_asr_librispeech_conformer_ctc": _get_english_model,
+     "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21": _get_english_model,
+     "csukuangfj/wenet-english-model": _get_wenet_model,
+ }
+
+ chinese_english_mixed_models = {
+     "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": _get_chinese_english_mixed_model,
+     "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": _get_chinese_english_mixed_model,  # noqa
+ }
+
+ tibetan_models = {
+     "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02": _get_tibetan_pre_trained_model,  # noqa
+     "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29": _get_tibetan_pre_trained_model,  # noqa
+ }
+
+ arabic_models = {
+     "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06": _get_arabic_pre_trained_model,  # noqa
+ }
+
+ german_models = {
+     "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model,
+ }
+
+ french_models = {
+     "shaojieli/sherpa-onnx-streaming-zipformer-fr-2023-04-14": _get_french_pre_trained_model,
+ }
+
+ japanese_models = {
+     "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-fluent": _get_japanese_pre_trained_model,
+     "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-disfluent": _get_japanese_pre_trained_model,
+ }
+
+ russian_models = {
+     "alphacep/vosk-model-ru": _get_russian_pre_trained_model,
+     "alphacep/vosk-model-small-ru": _get_russian_pre_trained_model,
+ }
+
+ all_models = {
+     **chinese_models,
+     **english_models,
+     **chinese_english_mixed_models,
+     # **japanese_models,
+     **tibetan_models,
+     **arabic_models,
+     **german_models,
+     **french_models,
+     **russian_models,
+ }
+
+ language_to_models = {
+     "Chinese": list(chinese_models.keys()),
+     "English": list(english_models.keys()),
+     "Chinese+English": list(chinese_english_mixed_models.keys()),
+     # "Japanese": list(japanese_models.keys()),
+     "Tibetan": list(tibetan_models.keys()),
+     "Arabic": list(arabic_models.keys()),
+     "German": list(german_models.keys()),
+     "French": list(french_models.keys()),
+     "Russian": list(russian_models.keys()),
+ }
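
With model.py in place, recognizing a single file reduces to two calls: build (and cache) a recognizer, then decode. A usage sketch based on one of the English models registered above; test.wav stands for a hypothetical 16 kHz, 16-bit mono recording:

    from model import decode, get_pretrained_model

    recognizer = get_pretrained_model(
        "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
        decoding_method="greedy_search",
        num_active_paths=4,
    )
    print(decode(recognizer, "test.wav"))  # hypothetical input file
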
requirements (1).txt ADDED
@@ -0,0 +1,12 @@
+ https://download.pytorch.org/whl/cpu/torch-1.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl
+ https://download.pytorch.org/whl/cpu/torchaudio-0.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl
+
+ https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/k2-1.23.4.dev20230130%2Bcpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
+ https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/k2_sherpa-1.1-cp38-cp38-linux_x86_64.whl
+ https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/kaldifeat-1.22-cp38-cp38-linux_x86_64.whl
+
+ sentencepiece>=0.1.96
+ numpy
+
+ huggingface_hub
+ sherpa-onnx>=1.7.0
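
The direct wheel URLs pin a CPU-only Python 3.8 stack (torch 1.13.1 with matching k2, k2_sherpa, and kaldifeat builds), so on a matching interpreter the whole environment should install in one step:

    pip install -r "requirements (1).txt"
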