Spaces:
Running
Running
optimized model loading, added file upload widget
Browse files
app.py
CHANGED
@@ -13,17 +13,30 @@ HF_HUB_URL = 'ales/wav2vec2-cv-be'
|
|
13 |
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
|
14 |
MODEL_SAMPLING_RATE = 16_000 # 16kHz
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
-
def main(audio_fp: str):
|
18 |
# read audio file
|
19 |
inputs = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)[0]
|
20 |
|
21 |
-
# download Language Model from HF Hub
|
22 |
-
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
|
23 |
-
|
24 |
-
# init pipeline
|
25 |
-
pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
|
26 |
-
|
27 |
# recognize speech
|
28 |
pipeline_res = pipeline(inputs=inputs)
|
29 |
text = pipeline_res['text'][0] # unpack batch of size 1
|
@@ -31,6 +44,10 @@ def main(audio_fp: str):
|
|
31 |
# add technical information to the output
|
32 |
tech_data = pipeline_res
|
33 |
del tech_data['text']
|
|
|
|
|
|
|
|
|
34 |
tech_data['model_sampling_rate'] = MODEL_SAMPLING_RATE
|
35 |
tech_data['inputs_shape'] = inputs.shape
|
36 |
tech_data['inputs_max'] = inputs.max().item()
|
@@ -49,10 +66,18 @@ The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/a
|
|
49 |
|
50 |
iface = gr.Interface(
|
51 |
fn=main,
|
52 |
-
inputs=
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
outputs=[
|
57 |
gr.outputs.Textbox(type='str', label='Распазнаны тэкст'),
|
58 |
gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')
|
@@ -61,7 +86,7 @@ iface = gr.Interface(
|
|
61 |
description=('Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
|
62 |
'Акустычная мадэль + моўная мадэль.'
|
63 |
),
|
64 |
-
article=article
|
65 |
)
|
66 |
|
67 |
iface.launch(enable_queue=True)
|
|
|
13 |
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
|
14 |
MODEL_SAMPLING_RATE = 16_000 # 16kHz
|
15 |
|
16 |
+
# download the language model from the HF Hub once, at module import
|
17 |
+
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
|
18 |
+
|
19 |
+
# initialise the speech-recognition pipeline once at module import so that main() reuses it on every call
|
20 |
+
pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
|
21 |
+
|
22 |
+
|
23 |
+
def main(recorded_audio_fp: str, uploaded_audio_fp: str):
|
24 |
+
audio_fp = None
|
25 |
+
if recorded_audio_fp is not None:
|
26 |
+
audio_fp = recorded_audio_fp
|
27 |
+
used_audiofile = 'recorded'
|
28 |
+
elif uploaded_audio_fp is not None:
|
29 |
+
audio_fp = uploaded_audio_fp
|
30 |
+
used_audiofile = 'uploaded'
|
31 |
+
else:
|
32 |
+
return (
|
33 |
+
'Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.',
|
34 |
+
'Error! You have to either record or upload an audiofile.'
|
35 |
+
)
|
36 |
|
|
|
37 |
# read audio file
|
38 |
inputs = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)[0]
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
# recognize speech
|
41 |
pipeline_res = pipeline(inputs=inputs)
|
42 |
text = pipeline_res['text'][0] # unpack batch of size 1
|
|
|
44 |
# add technical information to the output
|
45 |
tech_data = pipeline_res
|
46 |
del tech_data['text']
|
47 |
+
tech_data['used_audiofile'] = used_audiofile
|
48 |
+
tech_data['recorded_file_present'] = recorded_audio_fp is not None
|
49 |
+
tech_data['uploaded_file_present'] = uploaded_audio_fp is not None
|
50 |
+
tech_data['audiofile_path'] = audio_fp
|
51 |
tech_data['model_sampling_rate'] = MODEL_SAMPLING_RATE
|
52 |
tech_data['inputs_shape'] = inputs.shape
|
53 |
tech_data['inputs_max'] = inputs.max().item()
|
|
|
66 |
|
67 |
iface = gr.Interface(
|
68 |
fn=main,
|
69 |
+
inputs=[
|
70 |
+
gr.inputs.Audio(
|
71 |
+
source='microphone', type='filepath',
|
72 |
+
label='Запішыце аўдыяфайл, каб распазнаць маўленьне',
|
73 |
+
optional=True,
|
74 |
+
),
|
75 |
+
gr.inputs.Audio(
|
76 |
+
source='upload', type='filepath',
|
77 |
+
label='Альбо загрузіце ўжо запісаны аўдыяфайл сюды',
|
78 |
+
optional=True
|
79 |
+
),
|
80 |
+
],
|
81 |
outputs=[
|
82 |
gr.outputs.Textbox(type='str', label='Распазнаны тэкст'),
|
83 |
gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')
|
|
|
86 |
description=('Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
|
87 |
'Акустычная мадэль + моўная мадэль.'
|
88 |
),
|
89 |
+
article=article
|
90 |
)
|
91 |
|
92 |
iface.launch(enable_queue=True)
|