Yurii Paniv committed · Commit 14485b0 · 1 parent: 9da9a4e

Add side-by-side comparison

Files changed:
- README.md +5 -0
- app.py +34 -37
- requirements-local.txt +2 -0
- requirements.txt +0 -1
README.md
CHANGED

@@ -4,6 +4,7 @@ emoji: 🐌
 colorFrom: blue
 colorTo: yellow
 sdk: gradio
+sdk_version: 3.2
 app_file: app.py
 pinned: false
 ---
@@ -12,6 +13,7 @@ pinned: false
 This is a repository with aim to apply various speech recognition models on Ukrainian language.
 
 You can see online demo here: https://huggingface.co/spaces/robinhad/ukrainian-stt.
+Github link: https://github.com/robinhad/voice-recognition-ua.
 Source code is in this repository together with auto-deploy pipeline scripts.
 
 
@@ -30,6 +32,9 @@ If you'd like to check out different models for Ukrainian language, please visit
 # 🤖 Training scripts
 Guides for training are available in corresponding folders for each model.
 
+# Support
+If you like my work, please support here: https://send.monobank.ua/jar/48iHq4xAXm
+
 # 🤝 Attribution
 [@robinhad](https://github.com/robinhad) - model training.
 [@egorsmkv](https://github.com/egorsmkv) - organized [Ukrainian Speech recognition community](https://github.com/egorsmkv/speech-recognition-uk).
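Note: the new `sdk_version: 3.2` pin in the README front matter matches the `gradio==3.2` pin added in requirements-local.txt below, so the Space runtime and a local checkout presumably run the same Gradio release.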
app.py
CHANGED

@@ -9,10 +9,6 @@ from os.path import exists
 from stt import Model
 from datetime import datetime
 
-MODEL_NAMES = [
-    "No scorer",
-    "With scorer"
-]
 
 # download model
 version = "v0.4"
@@ -22,15 +18,17 @@ scorer_name = "kenlm.scorer"
 model_link = f"{storage_url}/{model_name}"
 scorer_link = f"{storage_url}/{scorer_name}"
 
+def download(url, file_name):
+    if not exists(file_name):
+        print(f"Downloading {file_name}")
+        r = requests.get(url, allow_redirects=True)
+        with open(file_name, 'wb') as file:
+            file.write(r.content)
+    else:
+        print(f"Found {file_name}. Skipping download...")
 
-def client(audio_data: np.array, sample_rate: int, use_scorer=False):
-    output_audio = _convert_audio(audio_data, sample_rate)
-
-    fin = wave.open(output_audio, 'rb')
-    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
-
-    fin.close()
+def deepspeech(audio: np.array, use_scorer=False):
     ds = Model(model_name)
     if use_scorer:
         ds.enableExternalScorer("kenlm.scorer")
@@ -40,28 +38,30 @@ def client(audio_data: np.array, sample_rate: int, use_scorer=False):
     return result
 
 
-def download(url, file_name):
-    if not exists(file_name):
-        print(f"Downloading {file_name}")
-        r = requests.get(url, allow_redirects=True)
-        with open(file_name, 'wb') as file:
-            file.write(r.content)
-    else:
-        print(f"Found {file_name}. Skipping download...")
-
-
-    sample_rate, audio = audio
-    print(f"Input sample rate: {sample_rate}. Audio file length: {round(audio.shape[0]/sample_rate ,2)}")
-    use_scorer = True if model_name == "With scorer" else False
-
+def inference(audio: Tuple[int, np.array]):
+    print("=============================")
+    print(f"Time: {datetime.utcnow()}.`")
+
+    output_audio = _convert_audio(audio[1], audio[0])
+
+    fin = wave.open(output_audio, 'rb')
+    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
+    fin.close()
+
+    transcripts = []
+
+    transcripts.append("")
+    transcripts.append(deepspeech(audio, use_scorer=True))
+    print(f"Deepspeech with LM: `{transcripts[-1]}`")
+    transcripts.append(deepspeech(audio))
+    print(f"Deepspeech: `{transcripts[-1]}`")
+    return tuple(transcripts)
+
 
 def _convert_audio(audio_data: np.array, sample_rate: int):
+    audio_limit = sample_rate * 60 * 2  # limit audio to 2 minutes max
+    if audio_data.shape[0] > audio_limit:
+        audio_data = audio_data[0:audio_limit]
     source_audio = BytesIO()
     source_audio.write(audio_data)
     source_audio.seek(0)
@@ -76,23 +76,20 @@ def _convert_audio(audio_data: np.array, sample_rate: int):
     output_audio.seek(0)
     return output_audio
 
+with open("README.md") as file:
+    article = file.read()
 
 iface = gr.Interface(
-    fn=
+    fn=inference,
     inputs=[
         gr.inputs.Audio(type="numpy",
-                        label=
-        gr.inputs.Radio(
-            label="Виберіть Speech-to-Text модель",
-            choices=MODEL_NAMES,
-        ),
-
+                        label="Аудіо", optional=False),
     ],
-    outputs=gr.outputs.Textbox(label="
-    title="
+    outputs=[gr.outputs.Textbox(label="Wav2Vec2"), gr.outputs.Textbox(label="DeepSpeech with LM"), gr.outputs.Textbox(label="DeepSpeech")],
+    title="🇺🇦 Ukrainian Speech-to-Text models",
     theme="huggingface",
     description="Україномовний🇺🇦 Speech-to-Text за допомогою Coqui STT",
-    article=
+    article=article,
 )
 
 download(model_link, model_name)
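The new inference() fills three output textboxes: the Wav2Vec2 column gets an empty placeholder string (transcripts.append("")), so only the two DeepSpeech columns carry results in this commit. The comparison path can be smoke-tested locally without the Gradio UI. A minimal sketch, assuming the model and scorer files are already downloaded, that ds.stt(audio) is the Coqui STT call producing `result` in the lines elided from the diff, and using hypothetical file names for the model and a 16 kHz mono test clip:

# local_compare.py - hypothetical smoke test, not part of the commit
import wave

import numpy as np
from stt import Model  # Coqui STT, pinned as STT==1.3.0 in requirements.txt

MODEL_FILE = "uk.tflite"      # hypothetical name; app.py derives model_name from `version`
SCORER_FILE = "kenlm.scorer"  # same scorer file name app.py uses

def transcribe(audio: np.ndarray, use_scorer: bool) -> str:
    ds = Model(MODEL_FILE)
    if use_scorer:
        ds.enableExternalScorer(SCORER_FILE)
    return ds.stt(audio)  # expects 16-bit PCM samples at the model's sample rate

# Load a mono 16 kHz clip (hypothetical file), as inference() does after _convert_audio().
with wave.open("sample_16k_mono.wav", "rb") as fin:
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

# The same two passes inference() runs for its last two output columns.
print(f"DeepSpeech with LM: `{transcribe(audio, use_scorer=True)}`")
print(f"DeepSpeech: `{transcribe(audio, use_scorer=False)}`")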
requirements-local.txt
ADDED

@@ -0,0 +1,2 @@
+-r requirements.txt
+gradio==3.2
requirements.txt
CHANGED

@@ -1,3 +1,2 @@
-gradio==2.4.5
 STT==1.3.0
 pydub==0.25.1
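For local development, `pip install -r requirements-local.txt` pulls in the base requirements plus the `gradio==3.2` pin; on the Space itself, Gradio presumably comes from the `sdk_version: 3.2` front-matter pin instead, which is why the `gradio==2.4.5` line could be dropped from requirements.txt.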