Spaces:

ales
/

wav2vec2-cv-be-lm

Running

App Files Community

ales committed on Apr 14, 2022

Commit

1c6b627

•

1 Parent(s): 6b93fd2

optimized model loading, added file upload widget

Browse files

Files changed (1) hide show

app.py +37 -12

app.py CHANGED Viewed

@@ -13,17 +13,30 @@ HF_HUB_URL = 'ales/wav2vec2-cv-be'
 LM_HUB_FP = 'language_model/cv8be_5gram.bin'
 MODEL_SAMPLING_RATE = 16_000  # 16kHz
-def main(audio_fp: str):
     # read audio file
     inputs = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)[0]
-    # download Language Model from HF Hub
-    lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
-    # init pipeline
-    pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
     # recognize speech
     pipeline_res = pipeline(inputs=inputs)
     text = pipeline_res['text'][0]  # unpack batch of size 1
@@ -31,6 +44,10 @@ def main(audio_fp: str):
     # add technical information to the output
     tech_data = pipeline_res
     del tech_data['text']
     tech_data['model_sampling_rate'] = MODEL_SAMPLING_RATE
     tech_data['inputs_shape'] = inputs.shape
     tech_data['inputs_max'] = inputs.max().item()
@@ -49,10 +66,18 @@ The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/a
 iface = gr.Interface(
     fn=main,
-    inputs=gr.inputs.Audio(
-        source='microphone', type='filepath',
-        label='Запішыце аўдыяфайл, каб распазнаць маўленьне'
-    ),
     outputs=[
         gr.outputs.Textbox(type='str', label='Распазнаны тэкст'),
         gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')
@@ -61,7 +86,7 @@ iface = gr.Interface(
     description=('Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
                  'Акустычная мадэль + моўная мадэль.'
                  ),
-    article=article,
 )
 iface.launch(enable_queue=True)

 LM_HUB_FP = 'language_model/cv8be_5gram.bin'
 MODEL_SAMPLING_RATE = 16_000  # 16kHz
+# download Language Model from HF Hub
+lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
+# init pipeline
+pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
+def main(recorded_audio_fp: str, uploaded_audio_fp: str):
+    audio_fp = None
+    if recorded_audio_fp is not None:
+        audio_fp = recorded_audio_fp
+        used_audiofile = 'recorded'
+    elif uploaded_audio_fp is not None:
+        audio_fp = uploaded_audio_fp
+        used_audiofile = 'uploaded'
+    else:
+        return (
+            'Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.',
+            'Error! You have to either record or upload an audiofile.'
+        )
     # read audio file
     inputs = librosa.load(audio_fp, sr=MODEL_SAMPLING_RATE, mono=True)[0]
     # recognize speech
     pipeline_res = pipeline(inputs=inputs)
     text = pipeline_res['text'][0]  # unpack batch of size 1
     # add technical information to the output
     tech_data = pipeline_res
     del tech_data['text']
+    tech_data['used_audiofile'] = used_audiofile
+    tech_data['recorded_file_present'] = recorded_audio_fp is not None
+    tech_data['uploaded_file_present'] = uploaded_audio_fp is not None
+    tech_data['audiofile_path'] = audio_fp
     tech_data['model_sampling_rate'] = MODEL_SAMPLING_RATE
     tech_data['inputs_shape'] = inputs.shape
     tech_data['inputs_max'] = inputs.max().item()
 iface = gr.Interface(
     fn=main,
+    inputs=[
+        gr.inputs.Audio(
+            source='microphone', type='filepath',
+            label='Запішыце аўдыяфайл, каб распазнаць маўленьне',
+            optional=True,
+        ),
+        gr.inputs.Audio(
+            source='upload', type='filepath',
+            label='Альбо загрузіце ўжо запісаны аўдыяфайл сюды',
+            optional=True
+        ),
+    ],
     outputs=[
         gr.outputs.Textbox(type='str', label='Распазнаны тэкст'),
         gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')
     description=('Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
                  'Акустычная мадэль + моўная мадэль.'
                  ),
+    article=article
 )
 iface.launch(enable_queue=True)