csukuangfj committed on
Commit
6b31279
1 Parent(s): 500c811

small fixes

Browse files
Files changed (2) hide show
  1. app.py +63 -9
  2. model.py +49 -0
app.py CHANGED
@@ -16,14 +16,68 @@
16
  # See the License for the specific language governing permissions and
17
  # limitations under the License.
18
 
 
 
 
 
19
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  demo = gr.Blocks()
22
 
23
 
24
- def process_uploaded_file(uploaded_file: str):
25
- print("uploaded_file", uploaded_file)
26
- return "hello"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
 
29
  with demo:
@@ -36,9 +90,9 @@ with demo:
36
  optional=False,
37
  label="Upload from disk",
38
  )
39
- upload_button = gr.Button("Upload")
40
  uploaded_output = gr.outputs.Textbox(
41
- label="Recognized speech for uploaded file"
42
  )
43
 
44
  with gr.TabItem("Record from microphone"):
@@ -49,18 +103,18 @@ with demo:
49
  label="Record from microphone",
50
  )
51
  recorded_output = gr.outputs.Textbox(
52
- label="Recognized speech for recordings"
53
  )
54
 
55
- record_button = gr.Button("Record")
56
 
57
  upload_button.click(
58
- process_uploaded_file,
59
  inputs=uploaded_file,
60
  outputs=uploaded_output,
61
  )
62
  record_button.click(
63
- process_uploaded_file,
64
  inputs=microphone,
65
  outputs=recorded_output,
66
  )
 
16
  # See the License for the specific language governing permissions and
17
  # limitations under the License.
18
 
19
+ import os
20
+ import time
21
+ from datetime import datetime
22
+
23
  import gradio as gr
24
+ import torchaudio
25
+
26
+ from model import get_gigaspeech_pre_trained_model, sample_rate
27
+
28
+ models = {"english": get_gigaspeech_pre_trained_model()}
29
+
30
+
31
def convert_to_wav(in_filename: str) -> str:
    """Convert the input audio file to a wave file.

    ffmpeg is invoked to transcode ``in_filename`` into a new file whose
    name is ``in_filename`` with ``.wav`` appended.

    Args:
      in_filename:
        Path of the audio file to convert.

    Returns:
      The output filename, i.e. ``in_filename + ".wav"``. The conversion is
      best-effort: the name is returned whether or not ffmpeg succeeded, so
      a failed conversion surfaces when the caller tries to load the file.
    """
    # Local import so this function does not depend on a file-level import.
    import subprocess

    out_filename = in_filename + ".wav"
    print(f"Converting '{in_filename}' to '{out_filename}'")

    # Pass the arguments as a list and do not go through a shell: a filename
    # containing quotes, spaces or shell metacharacters is then handed to
    # ffmpeg verbatim instead of being interpreted as shell syntax.
    cmd = ["ffmpeg", "-hide_banner", "-i", in_filename, out_filename]
    try:
        # The exit status is deliberately not checked (best-effort).
        subprocess.run(cmd, check=False)
    except OSError as e:
        # Raised e.g. when the ffmpeg executable cannot be found.
        print(f"Failed to run ffmpeg: {e}")

    return out_filename
37
+
38
 
39
  demo = gr.Blocks()
40
 
41
 
42
def process(in_filename: str) -> str:
    """Recognize the speech in an audio file and return the transcript.

    The file is first converted to a wave file with ``convert_to_wav``,
    loaded with torchaudio, resampled to ``sample_rate`` if its sample rate
    differs, and then decoded with ``models["english"]``. Timing
    information (start/finish time, elapsed time, audio duration and
    real-time factor) is printed along the way.

    Args:
      in_filename:
        Path of the audio file to recognize.

    Returns:
      The first (and only) result returned by ``decode_waves`` for the
      audio.
    """
    print("in_filename", in_filename)
    filename = convert_to_wav(in_filename)

    time_format = "%Y-%m-%d %H:%M:%S.%f"
    started_at = datetime.now().strftime(time_format)
    print(f"Started at {started_at}")

    start = time.time()
    wave, wave_sample_rate = torchaudio.load(filename)

    if wave_sample_rate != sample_rate:
        print(
            f"Expected sample rate: {sample_rate}. Given: {wave_sample_rate}. "
            f"Resampling to {sample_rate}."
        )

        wave = torchaudio.functional.resample(
            wave,
            orig_freq=wave_sample_rate,
            new_freq=sample_rate,
        )
    wave = wave[0]  # use only the first channel.

    hyp = models["english"].decode_waves([wave])[0]

    end = time.time()
    # Take a fresh timestamp here; re-formatting the datetime captured
    # before decoding would report the start time as the finish time.
    finished_at = datetime.now().strftime(time_format)

    elapsed = end - start
    duration = wave.shape[0] / sample_rate
    # Empty audio has zero duration; avoid a ZeroDivisionError that would
    # discard the already computed hypothesis.
    rtf = elapsed / duration if duration > 0 else float("nan")

    print(f"Finished at {finished_at}. Elapsed: {elapsed: .3f} s")
    print(f"Duration {duration: .3f} s")
    print(f"RTF {rtf: .3f}")
    print("hyp")
    print(hyp)

    return hyp
81
 
82
 
83
  with demo:
 
90
  optional=False,
91
  label="Upload from disk",
92
  )
93
+ upload_button = gr.Button("Submit for recognition")
94
  uploaded_output = gr.outputs.Textbox(
95
+ label="Recognized speech from uploaded file"
96
  )
97
 
98
  with gr.TabItem("Record from microphone"):
 
103
  label="Record from microphone",
104
  )
105
  recorded_output = gr.outputs.Textbox(
106
+ label="Recognized speech from recordings"
107
  )
108
 
109
+ record_button = gr.Button("Submit for recordings")
110
 
111
  upload_button.click(
112
+ process,
113
  inputs=uploaded_file,
114
  outputs=uploaded_output,
115
  )
116
  record_button.click(
117
+ process,
118
  inputs=microphone,
119
  outputs=recorded_output,
120
  )
model.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
2
+ #
3
+ # See LICENSE for clarification regarding multiple authors
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from huggingface_hub import hf_hub_download
18
+ from functools import lru_cache
19
+
20
+
21
+ from offline_asr import OfflineAsr
22
+
23
+ sample_rate = 16000
24
+
25
+
26
@lru_cache(maxsize=1)
def get_gigaspeech_pre_trained_model():
    """Build the ``OfflineAsr`` recognizer for the GigaSpeech model.

    The neural network model file and the BPE model file are fetched with
    ``hf_hub_download`` and handed to ``OfflineAsr``, configured for greedy
    search on the CPU at ``sample_rate``. Thanks to ``lru_cache`` the
    recognizer is built on the first call only; later calls return the
    cached object.

    Returns:
      The constructed ``OfflineAsr`` instance.
    """
    # The neural network model is converted from
    # https://huggingface.co/wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2
    nn_model_path = hf_hub_download(
        repo_id="csukuangfj/icefall-asr-gigaspeech-pruned-transducer-stateless2",
        filename="cpu_jit-epoch-29-avg-11-torch-1.10.0.pt",
        subfolder="exp",
    )

    # The BPE model is taken from the original repository.
    bpe_model_path = hf_hub_download(
        repo_id="wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
        filename="bpe.model",
        subfolder="data/lang_bpe_500",
    )

    asr_options = dict(
        nn_model_filename=nn_model_path,
        bpe_model_filename=bpe_model_path,
        token_filename=None,
        decoding_method="greedy_search",
        num_active_paths=4,
        sample_rate=sample_rate,
        device="cpu",
    )
    return OfflineAsr(**asr_options)