Yehor Smoliakov committed
Commit bd540a9
1 Parent(s): 9cece8a

Refactor the app

Files changed (15)
  1. README.md +1 -1
  2. app.py +117 -31
  3. example_1.wav +0 -0
  4. example_2.wav +0 -0
  5. example_3.wav +0 -0
  6. example_4.wav +0 -0
  7. example_5.wav +0 -0
  8. example_6.wav +0 -0
  9. requirements.txt +3 -0
  10. sample_1.wav +0 -3
  11. sample_2.wav +0 -3
  12. sample_3.wav +0 -3
  13. sample_4.wav +0 -3
  14. sample_5.wav +0 -3
  15. sample_6.wav +0 -3
README.md CHANGED
@@ -11,7 +11,7 @@ pinned: true
 ## Install
 
 ```shell
-uv venv --python 3.12
+uv venv --python 3.11
 
 source .venv/bin/activate
 
app.py CHANGED
@@ -1,59 +1,130 @@
+import sys
 import time
 
 import torch
+import torchaudio
 import librosa
 
 import gradio as gr
 
+import transformers
 from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
 
+# Config
 model_name = "Yehor/w2v-bert-2.0-uk-v2"
-device = "cpu"
-max_duration = 30
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch_dtype = torch.float16
 
-asr_model = AutoModelForCTC.from_pretrained(model_name).to(device)
+min_duration = 0.5
+max_duration = 60
+
+concurrency_limit = 1
+use_torch_compile = False
+
+# Load the model
+asr_model = AutoModelForCTC.from_pretrained(model_name, torch_dtype=torch_dtype).to(device)
 processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
 
-audio_samples = [
-    "sample_1.wav",
-    "sample_2.wav",
-    "sample_3.wav",
-    "sample_4.wav",
-    "sample_5.wav",
-    "sample_6.wav",
+if use_torch_compile:
+    asr_model = torch.compile(asr_model)
+
+# Elements
+examples = [
+    "example_1.wav",
+    "example_2.wav",
+    "example_3.wav",
+    "example_4.wav",
+    "example_5.wav",
+    "example_6.wav",
 ]
 
-description_head = """
+examples_table = '''
+| File | Text |
+| ------------- | ------------- |
+| `example_1.wav` | тема про яку не люблять говорити офіційні джерела у генштабі і міноборони це хімічна зброя окупанти вже тривалий час використовують хімічну зброю заборонену |
+| `example_2.wav` | всіма конвенціями якщо спочатку це були гранати з дронів то тепер фіксують випадки застосування |
+| `example_3.wav` | хімічних снарядів причому склад отруйної речовони різний а отже й наслідки для наших військових теж різні |
+| `example_4.wav` | використовує на фронті все що має і хімічна зброя не нийняток тож з чим маємо справу розбиралася марія моганисян |
+| `example_5.wav` | двох тисяч випадків застосування росіянами боєприпасів споряджених небезпечними хімічними речовинами |
+| `example_6.wav` | на всі писані норми марія моганисян олександр моторний спецкор марафон єдині новини |
+'''.strip()
+
+# https://www.tablesgenerator.com/markdown_tables
+authors_table = '''
+## Authors
+
+Follow them on social networks and **contact** them if you need any help or have any questions:
+
+| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+|-------------------------------------------------------------------------------------------------|
+| https://t.me/smlkw in Telegram |
+| https://x.com/yehor_smoliakov at X |
+| https://github.com/egorsmkv at GitHub |
+| https://huggingface.co/Yehor at Hugging Face |
+| or use egorsmkv@gmail.com |
+'''.strip()
+
+description_head = f"""
 # Speech-to-Text for Ukrainian v2
 
 ## Overview
 
-This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model that solves
-a Speech-to-Text task for the Ukrainian language.
+This space uses the https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model to recognize audio files.
+
+> For the demo, audio duration **must not** exceed **{max_duration}** seconds.
 """.strip()
 
-description_foot = """
+description_foot = f"""
 ## Community
 
-- Join our Discord server - https://discord.gg/yVAjkBgmt4 - where we're talking about Data Science,
-Machine Learning, Deep Learning, and Artificial Intelligence.
+- Join our Discord server where we talk about AI/ML/DL: https://discord.gg/yVAjkBgmt4
+- Join our Speech Recognition group in Telegram: https://t.me/speech_recognition_uk
 
-- Join our Speech Recognition Group in Telegram: https://t.me/speech_recognition_uk
+## More
 
-## Authors
+Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk
+
+{authors_table}
+""".strip()
+
+transcription_value = """
+Recognized text will appear here.
+
+Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record something.
+""".strip()
 
-Yehor Smoliakov: https://github.com/egorsmkv on GitHub, and egorsmkv@gmail.com for private discussions.
+tech_env = f"""
+#### Environment
+
+- Python: {sys.version}
+- Torch device: {device}
+- Torch dtype: {torch_dtype}
+- Use torch.compile: {use_torch_compile}
+""".strip()
+
+tech_libraries = f"""
+#### Libraries
+
+- PyTorch: {torch.__version__}
+- Transformers: {transformers.__version__}
+- Librosa: {librosa.version.version}
+- Gradio: {gr.__version__}
 """.strip()
 
 
 def inference(audio_path, progress=gr.Progress()):
-    gr.Info("Starting process", duration=2)
+    if not audio_path:
+        raise gr.Error("Please upload an audio file.")
 
-    progress(0, desc="Starting")
+    gr.Info("Starting recognition", duration=2)
+
+    progress(0, desc="Recognizing")
 
     duration = librosa.get_duration(path=audio_path)
+    if duration < min_duration:
+        raise gr.Error(f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds.")
     if duration > max_duration:
-        raise gr.Error("The duration of the file exceeds 10 seconds.")
+        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")
 
     paths = [
         audio_path,
@@ -70,12 +141,18 @@ def inference(audio_path, progress=gr.Progress()):
         features = processor([audio_input], sampling_rate=16_000).input_features
         features = torch.tensor(features).to(device)
 
+        if torch_dtype == torch.float16:
+            features = features.half()
+
         with torch.inference_mode():
            logits = asr_model(features).logits
 
        predicted_ids = torch.argmax(logits, dim=-1)
        predictions = processor.batch_decode(predicted_ids)
 
+        if not predictions:
+            predictions = '-'
+
        elapsed_time = round(time.time() - t0, 2)
        rtf = round(elapsed_time / audio_duration, 4)
        audio_duration = round(audio_duration, 2)
@@ -89,7 +166,7 @@ def inference(audio_path, progress=gr.Progress()):
            }
        )
 
-    gr.Info("Finished...", duration=2)
+    gr.Info("Finished!", duration=2)
 
    result_texts = []
 
@@ -113,24 +190,34 @@ demo = gr.Blocks(
 with demo:
    gr.Markdown(description_head)
 
-    gr.Markdown(f"## Demo (max. duration: **{max_duration}** seconds)")
+    gr.Markdown("## Demo")
 
    with gr.Row():
        audio_file = gr.Audio(label="Audio file", type="filepath")
        transcription = gr.Markdown(
            label="Transcription",
-            value="Recognized text will appear here. Use **an example file** below the Recognize button,"
-            "upload **your audio file**, or use **the microphone** to record something...",
        )
 
-    gr.Button("Recognize").click(inference, inputs=audio_file, outputs=transcription)
+    gr.Button("Recognize").click(
+        inference,
+        concurrency_limit=concurrency_limit,
+        inputs=audio_file,
+        outputs=transcription,
+    )
 
    with gr.Row():
-        gr.Examples(
-            label="Choose an example audio", inputs=audio_file, examples=audio_samples
-        )
+        gr.Examples(label="Choose an example", inputs=audio_file, examples=examples)
+
+        gr.Markdown(examples_table)
 
    gr.Markdown(description_foot)
-
+
+    gr.Markdown('### The Gradio app uses the following technologies:')
+    with gr.Row():
+        gr.Markdown(tech_env)
+        gr.Markdown(tech_libraries)
+
 if __name__ == "__main__":
+    demo.queue()
    demo.launch()
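
For reviewers who want to try the new inference path outside Gradio, the refactored app.py reduces to the sketch below. Assumptions beyond the diff: it reads the repo's `example_1.wav`, and it keeps float16 only on CUDA (the committed code sets `torch_dtype = torch.float16` unconditionally, which can be slow or unsupported on CPU).

```python
# A condensed, standalone sketch of the inference path introduced above.
import time

import librosa
import torch
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor

model_name = "Yehor/w2v-bert-2.0-uk-v2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Deviation from the commit: fall back to float32 on CPU
torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

asr_model = AutoModelForCTC.from_pretrained(model_name, torch_dtype=torch_dtype).to(device)
processor = Wav2Vec2BertProcessor.from_pretrained(model_name)

# Load and resample to the 16 kHz rate the processor expects
audio_input, _ = librosa.load("example_1.wav", sr=16_000, mono=True)
audio_duration = len(audio_input) / 16_000

t0 = time.time()
features = processor([audio_input], sampling_rate=16_000).input_features
features = torch.tensor(features).to(device)
if torch_dtype == torch.float16:
    features = features.half()  # match the model's half-precision weights

with torch.inference_mode():
    logits = asr_model(features).logits  # frame-level CTC scores

predicted_ids = torch.argmax(logits, dim=-1)  # greedy CTC decoding
predictions = processor.batch_decode(predicted_ids)

elapsed = time.time() - t0
print(predictions[0])
print(f"RTF: {elapsed / audio_duration:.4f}")  # real-time factor, as reported by the app
```

The printed RTF mirrors the app's `rtf = elapsed_time / audio_duration`: values below 1 mean recognition runs faster than real time.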
example_1.wav ADDED
Binary file (273 kB)

example_2.wav ADDED
Binary file (200 kB)

example_3.wav ADDED
Binary file (193 kB)

example_4.wav ADDED
Binary file (241 kB)

example_5.wav ADDED
Binary file (193 kB)

example_6.wav ADDED
Binary file (186 kB)
 
requirements.txt CHANGED
@@ -3,6 +3,9 @@ gradio
 torch
 torchaudio
 
+triton
+setuptools
+
 transformers
 
 librosa
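
The `triton` and `setuptools` pins appear to support the new `use_torch_compile` flag in app.py: `torch.compile`'s default Inductor backend generates GPU kernels through Triton, and its build path relies on setuptools at runtime. A hedged sketch of gating compilation on that dependency (`maybe_compile` is illustrative, not part of this commit):

```python
import torch

def maybe_compile(model: torch.nn.Module, enabled: bool) -> torch.nn.Module:
    """Compile the model only when requested and when Triton is importable."""
    if not enabled:
        return model
    try:
        import triton  # noqa: F401  # Inductor's GPU codegen dependency
    except ImportError:
        # Fall back to eager mode instead of failing at the first forward pass
        return model
    return torch.compile(model)

# Usage mirroring app.py's flag:
# asr_model = maybe_compile(asr_model, use_torch_compile)
```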
sample_1.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:172ade978b299f4a0c47e3b76666d1a06161e6001fbb5591b82038a1bbc4b5ad
-size 272568

sample_2.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:98fe42f22f8ea632714081a958dc035f3d507523fd340b320a1223ac2f55ccac
-size 199942

sample_3.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:83c0b7375beada8cee74b5de226da494368fcc6a3ce692913b3302dcda0bd9a2
-size 192842

sample_4.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:19e466ee9c0c129c1eecf93eb6791a44c2ee8d68dce2c3e8fd3734b87f28324a
-size 241442

sample_5.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5af19120c92859846a08496e0a617c21877cae2db5807d211f0a431d95163a3e
-size 193388

sample_6.wav DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ac877968d5749438930339497f7548046003390a848496136f6cbe8a74c51629
-size 186290
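
The deleted `sample_*.wav` entries were Git LFS pointer stubs (three text lines each; the audio bytes lived in LFS storage), and their recorded sizes match the added `example_*.wav` binaries byte-for-kilobyte, suggesting the same audio was recommitted as plain files. For reference, a minimal parser for the pointer format shown above, illustrative only and not part of the commit:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer file into its version, oid, and size fields."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

pointer = """\
version https://git-lfs.github.com/spec/v1
oid sha256:172ade978b299f4a0c47e3b76666d1a06161e6001fbb5591b82038a1bbc4b5ad
size 272568
"""
print(parse_lfs_pointer(pointer))  # size_bytes == 272568, i.e. ~273 kB
```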