thanhtvt committed on
Commit
a0dfd75
·
1 Parent(s): edc41a0

demo of v0.1.0-beta release

Browse files
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__/
2
+ checkpoints/
3
+ vocabs/
4
+ *.yaml
app.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import logging
4
+ import os
5
+ import soundfile as sf
6
+ import tensorflow as tf
7
+
8
+ from datetime import datetime
9
+ from time import time
10
+
11
+ from examples import examples
12
+ from model import UETASRModel
13
+
14
+
15
def get_duration(filename: str) -> float:
    """Return the duration, in seconds, of the audio file at ``filename``.

    Newer librosa releases name the file argument ``path`` and deprecate the
    older ``filename`` keyword, so ``path`` is tried first and ``filename`` is
    used only when the installed librosa does not accept ``path``.

    Args:
        filename: Path of the audio file to measure.

    Returns:
        The value reported by ``librosa.get_duration``.
    """
    try:
        return librosa.get_duration(path=filename)
    except TypeError:
        # Older librosa: ``path`` is an unknown keyword, fall back.
        return librosa.get_duration(filename=filename)
17
+
18
+
19
def convert_to_wav(in_filename: str) -> str:
    """Decode ``in_filename`` and write it out as a 16 kHz WAV file.

    The output is written next to the input with a ``.wav`` extension. When
    the input already has that name, a ``_16k.wav`` suffix is used instead so
    the caller's original file is never overwritten.

    Args:
        in_filename: Path of an audio file that ``librosa.load`` can read.

    Returns:
        Path of the WAV file that was written.
    """
    root, _ = os.path.splitext(in_filename)
    out_filename = root + ".wav"
    if os.path.abspath(out_filename) == os.path.abspath(in_filename):
        # Input is already "<root>.wav": writing there would replace the
        # source recording (e.g. the bundled examples) with the resampled copy.
        out_filename = root + "_16k.wav"

    logging.info(f"Converting {in_filename} to {out_filename}")
    y, sr = librosa.load(in_filename, sr=16000)
    sf.write(out_filename, y, sr)
    return out_filename
25
+
26
+
27
def build_html_output(s: str, style: str = "result_item_success"):
    """Wrap ``s`` in the nested ``div`` markup used for result panels.

    ``s`` is inserted verbatim (no escaping), and ``style`` is added as an
    extra CSS class on the inner ``div``.
    """
    pieces = (
        "",
        "<div class='result'>",
        f"<div class='result_item {style}'>",
        f"{s}",
        "</div>",
        "</div>",
        "",
    )
    return "\n".join(pieces)
35
+
36
+
37
def process_uploaded_file(in_filename: str):
    """Handle a recognition request for a file chosen on disk.

    Returns a ``(text, html)`` pair. When no path is given, or when
    ``process`` raises, the text is empty and the HTML is an error panel
    carrying the message.
    """
    if in_filename is not None and in_filename != "":
        logging.info(f"Processing uploaded file: {in_filename}")
        try:
            return process(in_filename=in_filename)
        except Exception as err:
            # Any failure is reported to the user instead of propagating.
            logging.error(str(err))
            return "", build_html_output(str(err), "result_item_error")

    hint = (
        "Please first upload a file and then click "
        'the button "submit for recognition"'
    )
    return "", build_html_output(hint, "result_item_error")
51
+
52
+
53
def process_microphone(in_filename: str):
    """Handle a recognition request for a microphone recording.

    Args:
        in_filename: Path of the recorded audio file, or ``None``/``""`` when
            nothing has been recorded yet.

    Returns:
        A ``(text, html)`` pair. When no recording is available, or when
        ``process`` raises, the text is empty and the HTML is an error panel
        carrying the message.
    """
    if in_filename is None or in_filename == "":
        # This handler serves the microphone tab, so ask for a recording
        # rather than a file upload.
        return "", build_html_output(
            "Please first record audio and then click "
            'the button "submit for recognition"',
            "result_item_error",
        )

    logging.info(f"Processing microphone: {in_filename}")
    try:
        return process(in_filename=in_filename)
    except Exception as e:
        # Any failure is reported to the user instead of propagating.
        logging.error(str(e))
        return "", build_html_output(str(e), "result_item_error")
67
+
68
+
69
def process(in_filename: str):
    """Transcribe one audio file and report timing statistics.

    The input is converted with ``convert_to_wav``, recognised with
    ``UETASRModel``, and the wall-clock time of model construction plus
    prediction is compared to the audio duration to obtain the real-time
    factor (RTF).

    Args:
        in_filename: Path of the audio file to transcribe.

    Returns:
        A ``(text, html)`` pair: the recognised text and an HTML panel with
        the duration, processing time and RTF.
    """
    logging.info(f"in_filename: {in_filename}")

    filename = convert_to_wav(in_filename)

    started_at = datetime.now().strftime("%d/%m/%Y, %H:%M:%S.%f")
    logging.info(f"Started at {started_at}")

    repo_id = "thanhtvt/uetasr-conformer_30.3m"

    start = time()

    recognizer = UETASRModel(repo_id)
    text = recognizer.predict(filename)

    end = time()
    # Take a fresh timestamp here; re-formatting the start timestamp would
    # log the start time a second time.
    finished_at = datetime.now().strftime("%d/%m/%Y, %H:%M:%S.%f")

    elapsed = end - start
    duration = get_duration(filename)
    rtf = elapsed / duration

    logging.info(f"Finished at {finished_at}. Elapsed: {elapsed: .3f} s")

    info = (
        "\n"
        f"Wave duration : {duration: .3f} s <br/>\n"
        f"Processing time: {elapsed: .3f} s <br/>\n"
        f"RTF: {elapsed: .3f}/{duration: .3f} = {rtf:.3f} <br/>\n"
    )
    if rtf > 1:
        info += (
            "<br/>We are loading the model for the first run. "
            "Please run again to measure the real RTF.<br/>"
        )

    logging.info(info)

    return text, build_html_output(info)
107
+
108
+
109
# Text rendered at the top of the page.
title = "Vietnamese Automatic Speech Recognition with UETASR"

# Markdown rendered at the bottom of the page.
description = "\n".join(
    [
        "",
        "This space shows how to use UETASR for Vietnamese Automatic Speech Recognition.",
        "",
        "It is running on CPU provided by Hugging Face 🤗",
        "",
        "See more information by visiting the [Github repository](https://github.com/thanhtvt/uetasr/)",
        "",
    ]
)

# css style is copied from
# https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
css = "\n".join(
    [
        "",
        ".result {display:flex;flex-direction:column}",
        ".result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}",
        ".result_item_success {background-color:mediumaquamarine;color:white;align-self:start}",
        ".result_item_error {background-color:#ff7070;color:white;align-self:start}",
        "",
    ]
)
126
+
127
# Page layout: one tab per input method, each with its own submit button,
# transcript box and info panel, followed by the project description.
demo = gr.Blocks(css=css)


with demo:
    gr.Markdown(title)

    with gr.Tabs():
        with gr.TabItem("Upload from disk"):
            uploaded_file = gr.Audio(
                source="upload",  # Choose between "microphone", "upload"
                type="filepath",
                label="Upload from disk",
            )
            upload_button = gr.Button("Submit for recognition")
            uploaded_output = gr.Textbox(label="Recognized speech from uploaded file")
            uploaded_html_info = gr.HTML(label="Info")

            gr.Examples(
                examples=examples,
                inputs=uploaded_file,
                outputs=[uploaded_output, uploaded_html_info],
                fn=process_uploaded_file,
            )

        with gr.TabItem("Record from microphone"):
            microphone = gr.Audio(
                source="microphone",
                type="filepath",
                label="Record from microphone",
            )

            record_button = gr.Button("Submit for recognition")
            recorded_output = gr.Textbox(label="Recognized speech from recordings")
            recorded_html_info = gr.HTML(label="Info")

            # Examples on this tab must target this tab's own output
            # components, not the ones on the upload tab.
            gr.Examples(
                examples=examples,
                inputs=microphone,
                outputs=[recorded_output, recorded_html_info],
                fn=process_microphone,
            )

    upload_button.click(
        process_uploaded_file,
        inputs=uploaded_file,
        outputs=[uploaded_output, uploaded_html_info],
    )

    record_button.click(
        process_microphone,
        inputs=microphone,
        outputs=[recorded_output, recorded_html_info],
    )
    gr.Markdown(description)
181
+
182
+
183
if __name__ == "__main__":
    # Every log line carries a timestamp, the level and the source location.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )

    demo.launch(share=True)
examples.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Paths of the sample recordings shipped in ./test_wavs.
examples = [
    "./test_wavs/" + wav_name
    for wav_name in (
        "2022_1004_00001300_00002239.wav",
        "2022_1004_00087158_00087929.wav",
        "2022_1008_00110083_00110571.wav",
    )
]
model.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tensorflow as tf
3
+ from functools import lru_cache
4
+ from huggingface_hub import hf_hub_download
5
+ from hyperpyyaml import load_hyperpyyaml
6
+ from typing import Union
7
+
8
# Expose no CUDA device to this process (presumably to force CPU inference).
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})
9
+
10
+
11
def _get_checkpoint_filename(
    repo_id: str,
    filename: str,
    local_dir: str = None,
    local_dir_use_symlinks: Union[bool, str] = "auto",
    subfolder: str = "checkpoints"
) -> str:
    """Fetch one checkpoint file from the Hub repository ``repo_id``.

    All arguments are forwarded unchanged to ``hf_hub_download``; its return
    value (the path of the downloaded file) is returned as-is.
    """
    return hf_hub_download(
        repo_id=repo_id,
        subfolder=subfolder,
        filename=filename,
        local_dir_use_symlinks=local_dir_use_symlinks,
        local_dir=local_dir,
    )
26
+
27
+
28
def _get_bpe_model_filename(
    repo_id: str,
    filename: str,
    local_dir: str = None,
    local_dir_use_symlinks: Union[bool, str] = "auto",
    subfolder: str = "vocabs"
) -> str:
    """Fetch one subword-vocabulary file from the Hub repository ``repo_id``.

    All arguments are forwarded unchanged to ``hf_hub_download``; its return
    value (the path of the downloaded file) is returned as-is.
    """
    download_kwargs = {
        "repo_id": repo_id,
        "filename": filename,
        "subfolder": subfolder,
        "local_dir": local_dir,
        "local_dir_use_symlinks": local_dir_use_symlinks,
    }
    return hf_hub_download(**download_kwargs)
43
+
44
+
45
@lru_cache(maxsize=1)
def _get_conformer_pre_trained_model(repo_id: str, checkpoint_dir: str = "checkpoints"):
    """Download the model assets from ``repo_id`` and build the inference objects.

    The checkpoint shards, the subword vocabulary files and ``config.yaml``
    are downloaded into the directory containing this module. The config is
    then loaded and the checkpoint weights are restored into the model it
    describes. The result is cached, so repeated calls with the same
    arguments reuse the loaded objects.

    Args:
        repo_id: Hub repository that hosts the assets.
        checkpoint_dir: Sub-folder, both in the repository and locally, that
            holds the checkpoint files.

    Returns:
        The tuple ``(audio_encoder, encoder_model, searcher, model)`` taken
        from the loaded config.
    """
    # Resolve the download directory once, as an absolute path, so the
    # weights can be loaded from the same place regardless of the current
    # working directory.
    local_dir = os.path.dirname(os.path.abspath(__file__))

    for postfix in ["index", "data-00000-of-00001"]:
        tmp = _get_checkpoint_filename(
            repo_id=repo_id,
            filename="avg_top5_27-32.ckpt.{}".format(postfix),
            subfolder=checkpoint_dir,
            local_dir=local_dir,
            local_dir_use_symlinks=True,
        )
        print(tmp)

    for postfix in ["model", "vocab"]:
        tmp = _get_bpe_model_filename(
            repo_id=repo_id,
            filename="subword_vietnamese_500.{}".format(postfix),
            local_dir=local_dir,
            local_dir_use_symlinks=True,
        )
        print(tmp)

    config_path = hf_hub_download(
        repo_id=repo_id,
        filename="config.yaml",
        local_dir=local_dir,
        local_dir_use_symlinks=True,
    )
    print(config_path)
    # NOTE(review): the loaded config yields ready-made Python objects, so
    # ``repo_id`` must point at a trusted repository.
    # NOTE(review): any relative paths inside config.yaml (e.g. to the vocab
    # files) are presumably still resolved against the working directory;
    # confirm against the config.
    with open(config_path, "r") as f:
        config = load_hyperpyyaml(f)

    encoder_model = config["encoder_model"]
    searcher = config["decoder"]
    model = config["model"]
    audio_encoder = config["audio_encoder"]

    # Load from where the shards were downloaded, not relative to the cwd.
    checkpoint_prefix = os.path.join(local_dir, checkpoint_dir, "avg_top5_27-32.ckpt")
    model.load_weights(checkpoint_prefix).expect_partial()

    return audio_encoder, encoder_model, searcher, model
83
+
84
+
85
def read_audio(in_filename: str):
    """Read a WAV file into a tensor with a leading axis of size 1.

    The decoded audio has its last axis squeezed away, so this assumes a
    single-channel file (TODO confirm callers always pass mono audio).
    """
    raw_bytes = tf.io.read_file(in_filename)
    decoded = tf.audio.decode_wav(raw_bytes)
    samples = tf.squeeze(decoded[0], axis=-1)
    return tf.expand_dims(samples, axis=0)
90
+
91
+
92
class UETASRModel:
    """Inference wrapper around the pre-trained objects loaded for ``repo_id``."""

    def __init__(self, repo_id: str):
        """Load (or reuse the cached) featurizer, encoder, searcher and model."""
        (
            self.featurizer,
            self.encoder_model,
            self.searcher,
            self.model,
        ) = _get_conformer_pre_trained_model(repo_id)

    def predict(self, in_filename: str):
        """Transcribe the WAV file at ``in_filename``.

        Args:
            in_filename: Path of the WAV file to read with ``read_audio``.

        Returns:
            The searcher output converted to ``str`` via
            ``tf.compat.as_str_any``.
        """
        inputs = read_audio(in_filename)
        features = self.featurizer(inputs)
        features = self.model.cmvn(features) if self.model.use_cmvn else features

        # Single utterance: the mask marks every position along axis 1 as valid.
        num_frames = tf.shape(features)[1]
        mask = tf.sequence_mask([num_frames], maxlen=num_frames)
        mask = tf.expand_dims(mask, axis=1)
        encoder_outputs, encoder_masks = self.encoder_model(
            features, mask, training=False)

        # Count the positions the encoder reports as valid to get the length
        # passed to the searcher.
        encoder_mask = tf.squeeze(encoder_masks, axis=1)
        features_length = tf.math.reduce_sum(
            tf.cast(encoder_mask, tf.int32),
            axis=1
        )

        outputs = self.searcher(encoder_outputs, features_length)
        outputs = tf.compat.as_str_any(outputs.numpy())

        return outputs
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ uetasr @ git+https://github.com/thanhtvt/uetasr@v0.1.0-beta
2
+ librosa
3
+ requests==2.28.2
test_wavs/2022_1004_00001300_00002239.wav ADDED
Binary file (301 kB). View file
 
test_wavs/2022_1004_00087158_00087929.wav ADDED
Binary file (247 kB). View file
 
test_wavs/2022_1008_00110083_00110571.wav ADDED
Binary file (156 kB). View file