Spaces:
Runtime error
Runtime error
Commit
•
8435a41
0
Parent(s):
Duplicate from k2-fsa/streaming-automatic-speech-recognition
Browse filesCo-authored-by: fangjun <csukuangfj@users.noreply.huggingface.co>
- .gitattributes +34 -0
- .gitignore +2 -0
- README.md +14 -0
- app.py +205 -0
- model.py +162 -0
- requirements.txt +11 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
flagged/
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Streaming Automatic Speech Recognition
|
3 |
+
emoji: 📚
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 2.9.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: apache-2.0
|
11 |
+
duplicated_from: k2-fsa/streaming-automatic-speech-recognition
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
#
|
3 |
+
# Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
|
4 |
+
#
|
5 |
+
# See LICENSE for clarification regarding multiple authors
|
6 |
+
#
|
7 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
8 |
+
# you may not use this file except in compliance with the License.
|
9 |
+
# You may obtain a copy of the License at
|
10 |
+
#
|
11 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12 |
+
#
|
13 |
+
# Unless required by applicable law or agreed to in writing, software
|
14 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16 |
+
# See the License for the specific language governing permissions and
|
17 |
+
# limitations under the License.
|
18 |
+
|
19 |
+
# References:
|
20 |
+
# https://gradio.app/docs/#dropdown
|
21 |
+
|
22 |
+
import logging
import os
import shlex
from typing import List, Optional

import gradio as gr
import torchaudio

from model import create_recognizer, language_to_models
|
30 |
+
|
31 |
+
title = "Next-gen Kaldi: Real-time streaming speech recognition"
|
32 |
+
description = """
|
33 |
+
This space shows how to do **real-time** streaming speech recognition
|
34 |
+
with **Next-gen Kaldi**.
|
35 |
+
|
36 |
+
Please visit
|
37 |
+
<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>
|
38 |
+
for non-streaming speech recognition with **Next-gen Kaldi**.
|
39 |
+
|
40 |
+
It is running on CPU within a docker container provided by Hugging Face.
|
41 |
+
|
42 |
+
**Caution**: You may see **significant delay** since HuggingFace sends
|
43 |
+
your recorded data by chunks and the interval between chunks is
|
44 |
+
unknown, e.g., may be 2 seconds.
|
45 |
+
"""
|
46 |
+
|
47 |
+
article = """
|
48 |
+
See more information by visiting the following links:
|
49 |
+
|
50 |
+
- <https://github.com/k2-fsa/icefall>
|
51 |
+
- <https://github.com/k2-fsa/sherpa>
|
52 |
+
- <https://github.com/k2-fsa/k2>
|
53 |
+
- <https://github.com/lhotse-speech/lhotse>
|
54 |
+
|
55 |
+
If you want to deploy it locally, please see
|
56 |
+
<https://k2-fsa.github.io/sherpa/>
|
57 |
+
|
58 |
+
Usage instructions:
|
59 |
+
|
60 |
+
(1) Select a language and a model from the dropdown box
|
61 |
+
|
62 |
+
(2) Click the Record button to start
|
63 |
+
|
64 |
+
(3) Speak
|
65 |
+
|
66 |
+
(4) Click the Stop Recording button to stop
|
67 |
+
|
68 |
+
(5) **Remember to click the Clear button before you re-click the Record button**
|
69 |
+
|
70 |
+
(6) **Remember to click the Clear button before you re-click the Record button**
|
71 |
+
|
72 |
+
(7) **Remember to click the Clear button before you re-click the Record button**
|
73 |
+
"""
|
74 |
+
|
75 |
+
|
76 |
+
def convert_to_wav(in_filename: str) -> str:
    """Convert the input audio file to a 16 kHz wave file.

    Args:
      in_filename:
        Path to the recorded audio file; not necessarily in wave format.

    Returns:
      Path of the converted file (``in_filename + ".wav"``).  The conversion
      is best-effort: as before, a failing ffmpeg invocation is ignored and
      the (possibly missing) output path is still returned.
    """
    out_filename = in_filename + ".wav"
    # shlex.quote prevents shell injection via a crafted filename; the
    # previous hand-written single quotes could be broken out of by a
    # filename containing a single quote.
    _ = os.system(
        "ffmpeg -hide_banner -loglevel error"
        f" -i {shlex.quote(in_filename)} -ar 16000 {shlex.quote(out_filename)}"
    )

    return out_filename
85 |
+
|
86 |
+
|
87 |
+
def get_language_and_model() -> List[str]:
    """Flatten ``language_to_models`` into dropdown entries.

    Each returned entry has the format ``language | repo_id``.
    """
    return [
        f"{lang} | {repo}"
        for lang, repo_list in language_to_models.items()
        for repo in repo_list
    ]
|
97 |
+
|
98 |
+
|
99 |
+
# Flattened "language | repo_id" dropdown choices, computed once at import time.
language_model_list = get_language_and_model()
|
100 |
+
|
101 |
+
|
102 |
+
def process(language_and_repo_id: str, audio: Optional[str], state=None):
    """Streaming-recognition callback invoked by gradio for each audio chunk.

    Args:
      language_and_repo_id:
        A string of the form "language | repo_id" from the dropdown.
      audio:
        Path to the latest recorded chunk (not necessarily in wave format),
        or None once recording has stopped.
      state:
        None on the first call of a session; afterwards a list of
        [error-or-result text, language_and_repo_id, recognizer, stream, wasOk].

    Returns:
      A tuple ``(text, state)`` where ``text`` is the recognition result or
      an error message, matching the interface's two declared outputs.
    """
    language, repo_id = language_and_repo_id.split("|")
    language = language.strip()
    repo_id = repo_id.strip()

    if state is None:
        # First chunk of a new session: build the recognizer and its stream.
        logging.info("language: %s, repo_id: %s", language, repo_id)
        recognizer = create_recognizer(repo_id)
        stream = recognizer.create_stream()
        state = ["", language_and_repo_id, recognizer, stream, True]

    if not state[-1]:
        # A previous chunk hit an error; keep reporting it until Clear.
        return state[0], state

    if audio is None:
        # Recording stopped: return the pending error or the final result.
        if "Error" in state[0]:
            return state[0], state

        recognizer = state[2]
        stream = state[3]
        # Bug fix: the original returned only the text here, but gradio
        # expects (text, state) to match the two declared outputs.
        return recognizer.get_result(stream).text.lower(), state

    if state[1] != language_and_repo_id:
        state[0] = (
            "Error: Please don't change the language and model during recognition "
            + "or "
            + "please press the Clear button before you re-click Record or re-select "
            + "language and model.\n\n\n"
            + "Hint: Click Stop Recording and then press Clear to fix this error."
        )
        state[-1] = False

        return state[0], state

    filename = convert_to_wav(audio)

    samples, sample_rate = torchaudio.load(filename)
    # convert_to_wav resamples to 16 kHz, so this is an internal sanity check.
    assert sample_rate == 16000, (sample_rate, 16000)
    samples = samples.squeeze(0)
    duration = samples.numel() / 16000
    logging.debug("duration: %s s", duration)

    recognizer = state[2]
    stream = state[3]
    stream.accept_waveform(16000, samples)

    # Decode as many frames as are currently available on the stream.
    while recognizer.is_ready(stream):
        recognizer.decode_stream(stream)

    text = recognizer.get_result(stream).text.lower()
    logging.info(text)

    return text, state
|
170 |
+
|
171 |
+
|
172 |
+
# NOTE(review): gr.inputs/gr.outputs is the legacy gradio API; README.md pins
# sdk_version 2.9.0, where this is the expected style — do not modernize to
# gr.Dropdown/gr.Audio without also bumping the pinned SDK.
language_dropdown = gr.inputs.Dropdown(
    label="Select a language and a model",
    choices=language_model_list,
    default=language_model_list[0],
)

# live=True makes gradio call `process` repeatedly while recording; the
# "state" input/output pair threads the recognizer session between calls.
itf1 = gr.Interface(
    title=title,
    description=description,
    article=article,
    fn=process,
    inputs=[
        language_dropdown,
        gr.inputs.Audio(
            source="microphone",
            type="filepath",
            label="Press me to start recognition",
        ),
        "state",
    ],
    outputs=[
        gr.outputs.Textbox(type="str", label="result"),
        gr.outputs.State(label=""),
    ],
    live=True,
)
|
198 |
+
|
199 |
+
|
200 |
+
if __name__ == "__main__":
    # Timestamped log format with source location for easier debugging.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)

    itf1.launch()
|
model.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
|
2 |
+
#
|
3 |
+
# See LICENSE for clarification regarding multiple authors
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
import os
|
17 |
+
|
18 |
+
from huggingface_hub import hf_hub_download
|
19 |
+
|
20 |
+
# Workaround: copy the k2 shared libraries into sherpa's lib directory before
# importing sherpa, so the dynamic linker can resolve them.  The paths are
# specific to the Python 3.8 HuggingFace Spaces container — presumably the
# sherpa wheel does not bundle these .so files; verify if the base image or
# wheel versions change.
os.system(
    "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
)

import sherpa  # noqa
|
25 |
+
|
26 |
+
|
27 |
+
def _get_nn_model_filename(
    repo_id: str,
    filename: str,
    subfolder: str = "exp",
) -> str:
    """Download a model file from a HuggingFace repo and return its local path."""
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
|
38 |
+
|
39 |
+
|
40 |
+
def _get_token_filename(
    repo_id: str,
    filename: str = "tokens.txt",
    subfolder: str = "data/lang_char",
) -> str:
    """Download a token table from a HuggingFace repo and return its local path."""
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
|
51 |
+
|
52 |
+
|
53 |
+
def get_english_model_2022_12_19(repo_id: str):
    """Build a streaming English recognizer from separately-traced
    encoder/decoder/joiner checkpoints plus a BPE token table.

    NOTE(review): the function name says 2022_12_19 but the only repo
    registered for it is dated 2022-12-29 — presumably a typo; confirm.
    """
    enc_path, dec_path, join_path = (
        _get_nn_model_filename(repo_id=repo_id, filename=f"{part}_jit_trace.pt")
        for part in ("encoder", "decoder", "joiner")
    )
    token_path = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")

    # 16 kHz, 80-bin fbank features with dithering disabled.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = 16000
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    cfg = sherpa.OnlineRecognizerConfig(
        nn_model="",  # unused: encoder/decoder/joiner are given separately
        encoder_model=enc_path,
        decoder_model=dec_path,
        joiner_model=join_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method="greedy_search",
        chunk_size=32,
    )

    return sherpa.OnlineRecognizer(cfg)
|
78 |
+
|
79 |
+
|
80 |
+
def get_chinese_english_mixed_model_conv_emformer_transducer_stateless2_zh(
    repo_id: str,
) -> sherpa.OnlineRecognizer:
    """Build a streaming Chinese+English recognizer from a single
    torchscript checkpoint (conv-emformer transducer)."""
    model_path = _get_nn_model_filename(
        repo_id=repo_id, filename="cpu_jit-epoch-11-avg-1.pt"
    )
    token_path = _get_token_filename(repo_id=repo_id, subfolder="data/lang_char_bpe")

    # 16 kHz, 80-bin fbank features with dithering disabled.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = 16000
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    cfg = sherpa.OnlineRecognizerConfig(
        nn_model=model_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method="greedy_search",
    )

    return sherpa.OnlineRecognizer(cfg)
|
102 |
+
|
103 |
+
|
104 |
+
def get_chinese_english_mixed_model_k2fsa_zipformer_chinese_english_mixed(
    repo_id: str,
) -> sherpa.OnlineRecognizer:
    """Build a streaming Chinese+English recognizer from separately-traced
    encoder/decoder/joiner checkpoints (k2fsa zipformer)."""
    enc_path, dec_path, join_path = (
        _get_nn_model_filename(repo_id=repo_id, filename=f"{part}_jit_trace.pt")
        for part in ("encoder", "decoder", "joiner")
    )
    token_path = _get_token_filename(repo_id=repo_id, subfolder="data/lang_char_bpe")

    # 16 kHz, 80-bin fbank features with dithering disabled.
    fbank = sherpa.FeatureConfig()
    fbank.fbank_opts.frame_opts.samp_freq = 16000
    fbank.fbank_opts.mel_opts.num_bins = 80
    fbank.fbank_opts.frame_opts.dither = 0

    cfg = sherpa.OnlineRecognizerConfig(
        nn_model="",  # unused: encoder/decoder/joiner are given separately
        encoder_model=enc_path,
        decoder_model=dec_path,
        joiner_model=join_path,
        tokens=token_path,
        use_gpu=False,
        feat_config=fbank,
        decoding_method="greedy_search",
        chunk_size=32,
    )

    return sherpa.OnlineRecognizer(cfg)
|
131 |
+
|
132 |
+
|
133 |
+
def create_recognizer(repo_id: str) -> sherpa.OnlineRecognizer:
    """Create an online recognizer for the given model repo.

    Args:
      repo_id:
        A HuggingFace repo id registered in one of the model dicts below.

    Returns:
      A ``sherpa.OnlineRecognizer`` built by the factory registered for
      ``repo_id``.

    Raises:
      ValueError: if ``repo_id`` is not a supported model.
    """
    # Consistency fix: look up the pre-merged all_models registry instead of
    # duplicating the per-language branching (all_models was defined but
    # previously unused).
    try:
        factory = all_models[repo_id]
    except KeyError:
        raise ValueError(f"Unsupported repo_id: {repo_id}") from None
    return factory(repo_id)
|
140 |
+
|
141 |
+
|
142 |
+
# Registry of English models: repo_id -> factory returning an OnlineRecognizer.
english_models = {
    # https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
    "Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29": get_english_model_2022_12_19
}

# Registry of mixed Chinese+English models: repo_id -> factory.
chinese_english_mixed_models = {
    # https://huggingface.co/pfluo/k2fsa-zipformer-chinese-english-mixed
    "pfluo/k2fsa-zipformer-chinese-english-mixed": get_chinese_english_mixed_model_k2fsa_zipformer_chinese_english_mixed,
    # https://huggingface.co/ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh
    "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": get_chinese_english_mixed_model_conv_emformer_transducer_stateless2_zh,
}

# Union of every registry; keys across the per-language dicts must be unique.
all_models = {
    **english_models,
    **chinese_english_mixed_models,
}

# Display-language name -> list of repo_ids; consumed by app.py to populate
# the language/model dropdown.
language_to_models = {
    "English": list(english_models.keys()),
    "Chinese+English": list(chinese_english_mixed_models.keys()),
}
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
https://download.pytorch.org/whl/cpu/torch-1.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl
|
2 |
+
https://download.pytorch.org/whl/cpu/torchaudio-0.13.1%2Bcpu-cp38-cp38-linux_x86_64.whl
|
3 |
+
|
4 |
+
https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/k2-1.23.4.dev20230130%2Bcpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
|
5 |
+
https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/k2_sherpa-1.1-cp38-cp38-linux_x86_64.whl
|
6 |
+
https://huggingface.co/csukuangfj/wheels/resolve/main/2023-01-30/kaldifeat-1.22-cp38-cp38-linux_x86_64.whl
|
7 |
+
|
8 |
+
sentencepiece>=0.1.96
|
9 |
+
numpy
|
10 |
+
|
11 |
+
huggingface_hub
|