File size: 3,204 Bytes
aca9f3d
feb2a2b
 
 
6b93fd2
51f7123
feb2a2b
 
 
 
 
 
 
6b93fd2
feb2a2b
1c6b627
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
feb2a2b
6b93fd2
 
feb2a2b
 
44daa8d
 
feb2a2b
44daa8d
 
 
1c6b627
 
 
 
6b93fd2
44daa8d
51f7123
 
3702096
44daa8d
d71b5df
44daa8d
feb2a2b
6b93fd2
5b4ea6e
 
 
abfa68a
5b4ea6e
feb2a2b
 
 
1c6b627
 
 
 
 
 
 
 
 
 
 
 
44daa8d
1022fd5
 
44daa8d
cff8d27
 
6b93fd2
 
1c6b627
feb2a2b
 
5b4ea6e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from pprint import pformat

from huggingface_hub import hf_hub_download

import librosa

import gradio as gr

from pipeline import PreTrainedPipeline


# Hub repository id; used both as the source of the language-model file
# and as the model path handed to the pipeline.
HF_HUB_URL: str = "ales/wav2vec2-cv-be"
# Location of the language-model binary inside that repository.
LM_HUB_FP: str = "language_model/cv8be_5gram.bin"
# Sampling rate (Hz) that audio is loaded at before recognition.
MODEL_SAMPLING_RATE: int = 16_000

# NOTE: both statements below execute at import time.
# Fetch the language-model file from the Hub.
lm_fp = hf_hub_download(
    repo_id=HF_HUB_URL,
    filename=LM_HUB_FP,
)

# Build the recognition pipeline once; `main` reuses this instance.
pipeline = PreTrainedPipeline(
    model_path=HF_HUB_URL,
    language_model_fp=lm_fp,
)


def main(recorded_audio_fp: str, uploaded_audio_fp: str):
    """Recognize speech in one audio file and describe how the run went.

    A microphone recording takes precedence over an uploaded file when both
    are supplied.

    Args:
        recorded_audio_fp: Path to the microphone recording, or ``None`` when
            nothing was recorded.
        uploaded_audio_fp: Path to the uploaded file, or ``None`` when nothing
            was uploaded.

    Returns:
        A 2-tuple. Normally ``(text, tech_info)``: the first item of the
        pipeline's ``'text'`` entry, and the pretty-printed remainder of the
        pipeline output extended with details about the input. When both
        arguments are ``None`` the tuple instead holds the same error message
        in Belarusian and in English.
    """
    has_recording = recorded_audio_fp is not None
    has_upload = uploaded_audio_fp is not None

    # Guard clause: nothing to process.
    if not (has_recording or has_upload):
        return (
            'Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.',
            'Error! You have to either record or upload an audiofile.'
        )

    if has_recording:
        source_kind, chosen_fp = 'recorded', recorded_audio_fp
    else:
        source_kind, chosen_fp = 'uploaded', uploaded_audio_fp

    # librosa.load returns a sequence whose first item is the waveform.
    loaded = librosa.load(chosen_fp, sr=MODEL_SAMPLING_RATE, mono=True)
    waveform = loaded[0]

    result = pipeline(inputs=waveform)
    recognized = result['text'][0]  # batch of size 1 -> take its only item

    # The pipeline result itself is reused (mutated in place) as the
    # technical report: drop the text, then append input diagnostics.
    del result['text']
    diagnostics = {
        'used_audiofile': source_kind,
        'recorded_file_present': has_recording,
        'uploaded_file_present': has_upload,
        'audiofile_path': chosen_fp,
        'model_sampling_rate': MODEL_SAMPLING_RATE,
        'inputs_shape': waveform.shape,
        'inputs_max': waveform.max().item(),
        'inputs_min': waveform.min().item(),
    }
    for key, value in diagnostics.items():
        result[key] = value

    return recognized, pformat(result)


# Markdown passed as the `article` argument of the Gradio interface:
# a link to the model page followed by a page-visits badge image.
article = '\n'.join((
    '',
    'The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/ales/wav2vec2-cv-be)',
    '',
    '![Page Visits](https://visitor-badge.glitch.me/badge?page_id=huggingface.co/spaces/ales/wav2vec2-cv-be-lm&left_color=darkgray&right_color=crimson&left_text=Page%20Visits)',
    '',
))

# NOTE(review): `gr.inputs` / `gr.outputs`, `optional=` and `enable_queue=`
# appear to belong to older Gradio releases — confirm the pinned Gradio
# version before upgrading any of these calls.

# Two alternative audio sources; both are optional, so `main` may receive
# None for either (or both) of them.
recorded_audio_input = gr.inputs.Audio(
    source='microphone',
    type='filepath',
    label='Запішыце аўдыяфайл, каб распазнаць маўленьне',
    optional=True,
)
uploaded_audio_input = gr.inputs.Audio(
    source='upload',
    type='filepath',
    label='Альбо загрузіце ўжо запісаны аўдыяфайл сюды',
    optional=True,
)

# Output boxes, in the same order as the tuple returned by `main`.
recognized_text_output = gr.outputs.Textbox(type='str', label='Распазнаны тэкст')
tech_info_output = gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')

interface_description = (
    'Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
    'Акустычная мадэль + моўная мадэль.'
)

iface = gr.Interface(
    fn=main,
    inputs=[recorded_audio_input, uploaded_audio_input],
    outputs=[recognized_text_output, tech_info_output],
    title='wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model',
    description=interface_description,
    article=article,
)

# Start serving the app (runs at import time, as in a script entry point).
iface.launch(enable_queue=True)