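"""Gradio demo for NVIDIA NeMo ASR models hosted on the Hugging Face Hub.

The app discovers all NeMo checkpoints tagged for automatic speech
recognition, groups them by language, and transcribes microphone or
uploaded audio by calling each model through the HF Inference API.
"""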
import gradio as gr

import nemo.collections.asr as nemo_asr

SAMPLE_RATE = 16000  # Hz; NeMo ASR models operate on 16 kHz audio
TITLE = "NeMo ASR Inference on Hugging Face"
DESCRIPTION = "Demo of all languages supported by NeMo ASR"
DEFAULT_EN_MODEL = "nvidia/stt_en_conformer_transducer_xlarge"

MARKDOWN = f"""
# {TITLE}

## {DESCRIPTION}
"""

CSS = """
p.big {
  font-size: 20px;
}
"""

ARTICLE = """
<br><br>
<p class='big' style='text-align: center'>
    <a href='https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/intro.html' target='_blank'>NeMo ASR</a> 
    | 
    <a href='https://github.com/NVIDIA/NeMo#nvidia-nemo' target='_blank'>Github Repo</a>
</p>
"""

SUPPORTED_LANGUAGES = set()
SUPPORTED_MODEL_NAMES = set()

# Query the Hugging Face Hub for all NeMo models tagged for speech recognition
hf_filter = nemo_asr.models.ASRModel.get_hf_model_filter()
hf_filter.task = "automatic-speech-recognition"

hf_infos = nemo_asr.models.ASRModel.search_huggingface_models(model_filter=hf_filter)
for info in hf_infos:
    # Model IDs follow the pattern "<org>/stt_<lang>_<arch>...", so the
    # language code is the second underscore-separated token
    lang_id = info.modelId.split("_")[1]
    SUPPORTED_LANGUAGES.add(lang_id)
    SUPPORTED_MODEL_NAMES.add(info.modelId)

SUPPORTED_MODEL_NAMES = sorted(SUPPORTED_MODEL_NAMES)

# Load each model as a remote Gradio interface backed by the HF Inference API,
# so no checkpoints are downloaded locally
model_dict = {model_name: gr.Interface.load(f'models/{model_name}') for model_name in SUPPORTED_MODEL_NAMES}

# Group model IDs by language code
SUPPORTED_LANG_MODEL_DICT = {}
for lang in SUPPORTED_LANGUAGES:
    for model_id in SUPPORTED_MODEL_NAMES:
        if f"_{lang}_" in model_id:
            SUPPORTED_LANG_MODEL_DICT.setdefault(lang, []).append(model_id)

# Sort model names within each language
for model_ids in SUPPORTED_LANG_MODEL_DICT.values():
    model_ids.sort()


def transcribe(microphone, audio_file, model_name):
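    """Transcribe audio with the selected model via the HF Inference API.

    If both a microphone recording and an uploaded file are provided, the
    recording takes precedence and a warning is prepended to the transcript.
    """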
    model = model_dict[model_name]

    warn_output = ""
    if (microphone is not None) and (audio_file is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
        audio_data = microphone

    elif (microphone is None) and (audio_file is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    elif microphone is not None:
        audio_data = microphone
    else:
        audio_data = audio_file

    try:
        # Call the remotely loaded interface with a filepath; it returns the
        # transcription produced by the HF Inference API
        transcriptions = model(audio_data)

    except Exception:
        transcriptions = ""
        if warn_output:
            warn_output += "\n\n"
        warn_output += (
            f"The model `{model_name}` is currently loading and cannot be used "
            f"for transcription.\n"
            f"Please try another model or wait a few minutes."
        )

    return warn_output + transcriptions


demo = gr.Blocks(title=TITLE, css=CSS)

with demo:
    header = gr.Markdown(MARKDOWN)

    with gr.Row():
        file_upload = gr.components.Audio(source="upload", type='filepath', label='Upload File')
        microphone = gr.components.Audio(source="microphone", type='filepath', label='Microphone')

    lang_selector = gr.components.Dropdown(
        choices=sorted(SUPPORTED_LANGUAGES), value="en", type="value", label="Languages", interactive=True,
    )
    models_in_lang = gr.components.Dropdown(
        choices=sorted(SUPPORTED_LANG_MODEL_DICT["en"]),
        value=DEFAULT_EN_MODEL,
        label="Models",
        interactive=True,
    )

    def update_models_with_lang(lang):
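        """Refresh the model dropdown to show only models for the chosen language."""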
        model_names = sorted(SUPPORTED_LANG_MODEL_DICT[lang])
        default = model_names[0]

        if lang == 'en':
            default = DEFAULT_EN_MODEL
        return models_in_lang.update(choices=model_names, value=default)

    # Repopulate the model list whenever the language selection changes
    lang_selector.change(update_models_with_lang, inputs=[lang_selector], outputs=[models_in_lang])

    transcript = gr.components.Label(label='Transcript')

    run = gr.components.Button('Transcribe')
    run.click(transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript])

    gr.components.HTML(ARTICLE)

# Queue requests and handle them one at a time
demo.queue(concurrency_count=1)
demo.launch(share=True)