csukuangfj committed on
Commit
ee6ba22
1 Parent(s): 3a4d643

Add test data.

Browse files
app.py CHANGED
@@ -27,6 +27,7 @@ from datetime import datetime
27
  import gradio as gr
28
  import torchaudio
29
 
 
30
  from model import get_pretrained_model, language_to_models, sample_rate
31
 
32
  languages = list(language_to_models.keys())
@@ -51,11 +52,11 @@ def build_html_output(s: str, style: str = "result_item_success"):
51
 
52
 
53
  def process_uploaded_file(
54
- in_filename: str,
55
  language: str,
56
  repo_id: str,
57
  decoding_method: str,
58
  num_active_paths: int,
 
59
  ):
60
  if in_filename is None or in_filename == "":
61
  return "", build_html_output(
@@ -79,11 +80,11 @@ def process_uploaded_file(
79
 
80
 
81
  def process_microphone(
82
- in_filename: str,
83
  language: str,
84
  repo_id: str,
85
  decoding_method: str,
86
  num_active_paths: int,
 
87
  ):
88
  if in_filename is None or in_filename == "":
89
  return "", build_html_output(
@@ -108,17 +109,17 @@ def process_microphone(
108
 
109
 
110
  def process(
111
- in_filename: str,
112
  language: str,
113
  repo_id: str,
114
  decoding_method: str,
115
  num_active_paths: int,
 
116
  ):
117
- logging.info(f"in_filename: {in_filename}")
118
  logging.info(f"language: {language}")
119
  logging.info(f"repo_id: {repo_id}")
120
  logging.info(f"decoding_method: {decoding_method}")
121
  logging.info(f"num_active_paths: {num_active_paths}")
 
122
 
123
  filename = convert_to_wav(in_filename)
124
 
@@ -210,6 +211,7 @@ def update_model_dropdown(language: str):
210
 
211
  demo = gr.Blocks(css=css)
212
 
 
213
  with demo:
214
  gr.Markdown(title)
215
  language_choices = list(language_to_models.keys())
@@ -256,6 +258,19 @@ with demo:
256
  uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
257
  uploaded_html_info = gr.HTML(label="Info")
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  with gr.TabItem("Record from microphone"):
260
  microphone = gr.Audio(
261
  source="microphone", # Choose between "microphone", "upload"
@@ -268,25 +283,39 @@ with demo:
268
  recorded_output = gr.Textbox(label="Recognized speech from recordings")
269
  recorded_html_info = gr.HTML(label="Info")
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  upload_button.click(
272
  process_uploaded_file,
273
  inputs=[
274
- uploaded_file,
275
  language_radio,
276
  model_dropdown,
277
  decoding_method_radio,
278
  num_active_paths_slider,
 
279
  ],
280
  outputs=[uploaded_output, uploaded_html_info],
281
  )
 
282
  record_button.click(
283
  process_microphone,
284
  inputs=[
285
- microphone,
286
  language_radio,
287
  model_dropdown,
288
  decoding_method_radio,
289
  num_active_paths_slider,
 
290
  ],
291
  outputs=[recorded_output, recorded_html_info],
292
  )
 
27
  import gradio as gr
28
  import torchaudio
29
 
30
+ from examples import examples
31
  from model import get_pretrained_model, language_to_models, sample_rate
32
 
33
  languages = list(language_to_models.keys())
 
52
 
53
 
54
  def process_uploaded_file(
 
55
  language: str,
56
  repo_id: str,
57
  decoding_method: str,
58
  num_active_paths: int,
59
+ in_filename: str,
60
  ):
61
  if in_filename is None or in_filename == "":
62
  return "", build_html_output(
 
80
 
81
 
82
  def process_microphone(
 
83
  language: str,
84
  repo_id: str,
85
  decoding_method: str,
86
  num_active_paths: int,
87
+ in_filename: str,
88
  ):
89
  if in_filename is None or in_filename == "":
90
  return "", build_html_output(
 
109
 
110
 
111
  def process(
 
112
  language: str,
113
  repo_id: str,
114
  decoding_method: str,
115
  num_active_paths: int,
116
+ in_filename: str,
117
  ):
 
118
  logging.info(f"language: {language}")
119
  logging.info(f"repo_id: {repo_id}")
120
  logging.info(f"decoding_method: {decoding_method}")
121
  logging.info(f"num_active_paths: {num_active_paths}")
122
+ logging.info(f"in_filename: {in_filename}")
123
 
124
  filename = convert_to_wav(in_filename)
125
 
 
211
 
212
  demo = gr.Blocks(css=css)
213
 
214
+
215
  with demo:
216
  gr.Markdown(title)
217
  language_choices = list(language_to_models.keys())
 
258
  uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
259
  uploaded_html_info = gr.HTML(label="Info")
260
 
261
+ gr.Examples(
262
+ examples=examples,
263
+ inputs=[
264
+ language_radio,
265
+ model_dropdown,
266
+ decoding_method_radio,
267
+ num_active_paths_slider,
268
+ uploaded_file,
269
+ ],
270
+ outputs=[uploaded_output, uploaded_html_info],
271
+ fn=process_uploaded_file,
272
+ )
273
+
274
  with gr.TabItem("Record from microphone"):
275
  microphone = gr.Audio(
276
  source="microphone", # Choose between "microphone", "upload"
 
283
  recorded_output = gr.Textbox(label="Recognized speech from recordings")
284
  recorded_html_info = gr.HTML(label="Info")
285
 
286
+ gr.Examples(
287
+ examples=examples,
288
+ inputs=[
289
+ language_radio,
290
+ model_dropdown,
291
+ decoding_method_radio,
292
+ num_active_paths_slider,
293
+ microphone,
294
+ ],
295
+ outputs=[recorded_output, recorded_html_info],
296
+ fn=process_microphone,
297
+ )
298
+
299
  upload_button.click(
300
  process_uploaded_file,
301
  inputs=[
 
302
  language_radio,
303
  model_dropdown,
304
  decoding_method_radio,
305
  num_active_paths_slider,
306
+ uploaded_file,
307
  ],
308
  outputs=[uploaded_output, uploaded_html_info],
309
  )
310
+
311
  record_button.click(
312
  process_microphone,
313
  inputs=[
 
314
  language_radio,
315
  model_dropdown,
316
  decoding_method_radio,
317
  num_active_paths_slider,
318
+ microphone,
319
  ],
320
  outputs=[recorded_output, recorded_html_info],
321
  )
examples.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
4
+ #
5
+ # See LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ examples = [
19
+ # librispeech
20
+ # https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs
21
+ [
22
+ "English",
23
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
24
+ "greedy_search",
25
+ 4,
26
+ "./test_wavs/librispeech/1089-134686-0001.wav",
27
+ ],
28
+ [
29
+ "English",
30
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
31
+ "greedy_search",
32
+ 4,
33
+ "./test_wavs/librispeech/1221-135766-0001.wav",
34
+ ],
35
+ [
36
+ "English",
37
+ "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
38
+ "greedy_search",
39
+ 4,
40
+ "./test_wavs/librispeech/1221-135766-0002.wav",
41
+ ],
42
+ # gigaspeech
43
+ [
44
+ "English",
45
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
46
+ "greedy_search",
47
+ 4,
48
+ "./test_wavs/gigaspeech/1-minute-audiobook.opus",
49
+ ],
50
+ [
51
+ "English",
52
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
53
+ "greedy_search",
54
+ 4,
55
+ "./test_wavs/gigaspeech/100-seconds-podcast.opus",
56
+ ],
57
+ [
58
+ "English",
59
+ "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
60
+ "greedy_search",
61
+ 4,
62
+ "./test_wavs/gigaspeech/100-seconds-youtube.opus",
63
+ ],
64
+ # wenetspeech
65
+ # https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2/tree/main/test_wavs
66
+ [
67
+ "Chinese",
68
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
69
+ "greedy_search",
70
+ 4,
71
+ "./test_wavs/wenetspeech/DEV_T0000000000.opus",
72
+ ],
73
+ [
74
+ "Chinese",
75
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
76
+ "greedy_search",
77
+ 4,
78
+ "./test_wavs/wenetspeech/DEV_T0000000001.opus",
79
+ ],
80
+ [
81
+ "Chinese",
82
+ "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
83
+ "greedy_search",
84
+ 4,
85
+ "./test_wavs/wenetspeech/DEV_T0000000002.opus",
86
+ ],
87
+ # aishell2-A
88
+ # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12/tree/main/test_wavs
89
+ [
90
+ "Chinese",
91
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
92
+ "greedy_search",
93
+ 4,
94
+ "./test_wavs/aishell2/ID0012W0030.wav",
95
+ ],
96
+ [
97
+ "Chinese",
98
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
99
+ "greedy_search",
100
+ 4,
101
+ "./test_wavs/aishell2/ID0012W0162.wav",
102
+ ],
103
+ [
104
+ "Chinese",
105
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
106
+ "greedy_search",
107
+ 4,
108
+ "./test_wavs/aishell2/ID0012W0215.wav",
109
+ ],
110
+ # aishell2-B
111
+ # https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12/tree/main/test_wavs
112
+ [
113
+ "Chinese",
114
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
115
+ "greedy_search",
116
+ 4,
117
+ "./test_wavs/aishell2/ID0012W0030.wav",
118
+ ],
119
+ [
120
+ "Chinese",
121
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
122
+ "greedy_search",
123
+ 4,
124
+ "./test_wavs/aishell2/ID0012W0162.wav",
125
+ ],
126
+ [
127
+ "Chinese",
128
+ "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
129
+ "greedy_search",
130
+ 4,
131
+ "./test_wavs/aishell2/ID0012W0215.wav",
132
+ ],
133
+ # tal_csasr
134
+ # https://huggingface.co/luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5/tree/main/test_wavs
135
+ [
136
+ "Chinese+English",
137
+ "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
138
+ "greedy_search",
139
+ 4,
140
+ "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav",
141
+ ],
142
+ [
143
+ "Chinese+English",
144
+ "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
145
+ "greedy_search",
146
+ 4,
147
+ "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav",
148
+ ],
149
+ [
150
+ "Chinese+English",
151
+ "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
152
+ "greedy_search",
153
+ 4,
154
+ "./test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav",
155
+ ],
156
+ ]
model.py CHANGED
@@ -81,7 +81,7 @@ def _get_aishell2_pretrained_model(repo_id: str) -> OfflineAsr:
81
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12", # noqa
82
  # context-size 2
83
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12", # noqa
84
- ]
85
 
86
  nn_model_filename = _get_nn_model_filename(
87
  repo_id=repo_id,
@@ -102,12 +102,11 @@ def _get_aishell2_pretrained_model(repo_id: str) -> OfflineAsr:
102
  def _get_gigaspeech_pre_trained_model(repo_id: str) -> OfflineAsr:
103
  assert repo_id in [
104
  "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
105
- ]
106
 
107
  nn_model_filename = _get_nn_model_filename(
108
- # It is converted from https://huggingface.co/wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2 # noqa
109
- repo_id="csukuangfj/icefall-asr-gigaspeech-pruned-transducer-stateless2", # noqa
110
- filename="cpu_jit-epoch-29-avg-11-torch-1.10.0.pt",
111
  )
112
  bpe_model_filename = _get_bpe_model_filename(repo_id=repo_id)
113
 
@@ -124,7 +123,7 @@ def _get_gigaspeech_pre_trained_model(repo_id: str) -> OfflineAsr:
124
  def _get_librispeech_pre_trained_model(repo_id: str) -> OfflineAsr:
125
  assert repo_id in [
126
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13", # noqa
127
- ]
128
 
129
  nn_model_filename = _get_nn_model_filename(
130
  repo_id=repo_id,
@@ -145,7 +144,7 @@ def _get_librispeech_pre_trained_model(repo_id: str) -> OfflineAsr:
145
  def _get_wenetspeech_pre_trained_model(repo_id: str):
146
  assert repo_id in [
147
  "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
148
- ]
149
 
150
  nn_model_filename = _get_nn_model_filename(
151
  repo_id=repo_id,
@@ -166,7 +165,7 @@ def _get_wenetspeech_pre_trained_model(repo_id: str):
166
  def _get_tal_csasr_pre_trained_model(repo_id: str):
167
  assert repo_id in [
168
  "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
169
- ]
170
 
171
  nn_model_filename = _get_nn_model_filename(
172
  repo_id=repo_id,
@@ -187,7 +186,7 @@ def _get_tal_csasr_pre_trained_model(repo_id: str):
187
  def _get_alimeeting_pre_trained_model(repo_id: str):
188
  assert repo_id in [
189
  "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
190
- ]
191
 
192
  nn_model_filename = _get_nn_model_filename(
193
  repo_id=repo_id,
 
81
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12", # noqa
82
  # context-size 2
83
  "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12", # noqa
84
+ ], repo_id
85
 
86
  nn_model_filename = _get_nn_model_filename(
87
  repo_id=repo_id,
 
102
  def _get_gigaspeech_pre_trained_model(repo_id: str) -> OfflineAsr:
103
  assert repo_id in [
104
  "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
105
+ ], repo_id
106
 
107
  nn_model_filename = _get_nn_model_filename(
108
+ repo_id=repo_id,
109
+ filename="cpu_jit-iter-3488000-avg-20.pt",
 
110
  )
111
  bpe_model_filename = _get_bpe_model_filename(repo_id=repo_id)
112
 
 
123
  def _get_librispeech_pre_trained_model(repo_id: str) -> OfflineAsr:
124
  assert repo_id in [
125
  "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13", # noqa
126
+ ], repo_id
127
 
128
  nn_model_filename = _get_nn_model_filename(
129
  repo_id=repo_id,
 
144
  def _get_wenetspeech_pre_trained_model(repo_id: str):
145
  assert repo_id in [
146
  "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
147
+ ], repo_id
148
 
149
  nn_model_filename = _get_nn_model_filename(
150
  repo_id=repo_id,
 
165
  def _get_tal_csasr_pre_trained_model(repo_id: str):
166
  assert repo_id in [
167
  "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5",
168
+ ], repo_id
169
 
170
  nn_model_filename = _get_nn_model_filename(
171
  repo_id=repo_id,
 
186
  def _get_alimeeting_pre_trained_model(repo_id: str):
187
  assert repo_id in [
188
  "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
189
+ ], repo_id
190
 
191
  nn_model_filename = _get_nn_model_filename(
192
  repo_id=repo_id,
test_wavs/aishell2/ID0012W0030.wav ADDED
Binary file (113 kB). View file
 
test_wavs/aishell2/ID0012W0162.wav ADDED
Binary file (114 kB). View file
 
test_wavs/aishell2/ID0012W0215.wav ADDED
Binary file (104 kB). View file
 
test_wavs/aishell2/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Files are downloaded from
2
+ https://huggingface.co/yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12/tree/main/test_wavs
test_wavs/aishell2/trans.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ID0012W0162 立法机关采纳了第二种意见
2
+ ID0012W0215 大家都愿意牺牲自己的生命
3
+ ID0012W0030 完全是典型的军事侵略
test_wavs/gigaspeech/1-minute-audiobook.opus ADDED
Binary file (580 kB). View file
 
test_wavs/gigaspeech/100-seconds-podcast.opus ADDED
Binary file (955 kB). View file
 
test_wavs/gigaspeech/100-seconds-youtube.opus ADDED
Binary file (948 kB). View file
 
test_wavs/librispeech/1089-134686-0001.wav ADDED
Binary file (212 kB). View file
 
test_wavs/librispeech/1221-135766-0001.wav ADDED
Binary file (535 kB). View file
 
test_wavs/librispeech/1221-135766-0002.wav ADDED
Binary file (154 kB). View file
 
test_wavs/librispeech/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Files are downloaded from
2
+ https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs
test_wavs/librispeech/trans.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 1089-134686-0001 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
2
+ 1221-135766-0001 GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
3
+ 1221-135766-0002 YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_132.wav ADDED
Binary file (163 kB). View file
 
test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_138.wav ADDED
Binary file (150 kB). View file
 
test_wavs/tal_csasr/210_36476_210_8341_1_1533271973_7057520_145.wav ADDED
Binary file (283 kB). View file
 
test_wavs/tal_csasr/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Files are downloaded from
2
+ https://huggingface.co/luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5/tree/main/test_wavs
test_wavs/wenetspeech/DEV_T0000000000.opus ADDED
Binary file (23.1 kB). View file
 
test_wavs/wenetspeech/DEV_T0000000001.opus ADDED
Binary file (21.5 kB). View file
 
test_wavs/wenetspeech/DEV_T0000000002.opus ADDED
Binary file (18.8 kB). View file
 
test_wavs/wenetspeech/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Files are downloaded from
2
+ https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2/tree/main/test_wavs