smajumdar commited on
Commit
c21743b
1 Parent(s): de75b59

Add support for Gradio Blocks for a better GUI

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. app.py +138 -0
  3. packages.txt +2 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Nemo Multilingual Language Id
3
- emoji: 🏢
4
  colorFrom: blue
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 3.7
8
  app_file: app.py
 
1
  ---
2
  title: Nemo Multilingual Language Id
3
+ emoji: 🐠
4
  colorFrom: blue
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 3.7
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+
4
+ import nemo.collections.asr as nemo_asr
5
+
6
+ SAMPLE_RATE = 16000
7
+ TITLE = "NeMo ASR Inference on Hugging Face"
8
+ DESCRIPTION = "Demo of all languages supported by NeMo ASR"
9
+ DEFAULT_EN_MODEL = "nvidia/stt_en_conformer_transducer_xlarge"
10
+
11
+ MARKDOWN = f"""
12
+ # {TITLE}
13
+
14
+ ## {DESCRIPTION}
15
+ """
16
+
17
+ CSS = """
18
+ p.big {
19
+ font-size: 20px;
20
+ }
21
+ """
22
+
23
+ ARTICLE = """
24
+ <br><br>
25
+ <p class='big' style='text-align: center'>
26
+ <a href='https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/intro.html' target='_blank'>NeMo ASR</a>
27
+ |
28
+ <a href='https://github.com/NVIDIA/NeMo#nvidia-nemo' target='_blank'>Github Repo</a>
29
+ </p>
30
+ """
31
+
32
+ SUPPORTED_LANGUAGES = set([])
33
+ SUPPORTED_MODEL_NAMES = set([])
34
+
35
+ # HF models
36
+ hf_filter = nemo_asr.models.ASRModel.get_hf_model_filter()
37
+ hf_filter.task = "automatic-speech-recognition"
38
+
39
+ hf_infos = nemo_asr.models.ASRModel.search_huggingface_models(model_filter=hf_filter)
40
+ for info in hf_infos:
41
+ lang_id = info.modelId.split("_")[1] # obtains lang id as str
42
+ SUPPORTED_LANGUAGES.add(lang_id)
43
+ SUPPORTED_MODEL_NAMES.add(info.modelId)
44
+
45
+ SUPPORTED_MODEL_NAMES = sorted(list(SUPPORTED_MODEL_NAMES))
46
+
47
+ model_dict = {model_name: gr.Interface.load(f'models/{model_name}') for model_name in SUPPORTED_MODEL_NAMES}
48
+
49
+ SUPPORTED_LANG_MODEL_DICT = {}
50
+ for lang in SUPPORTED_LANGUAGES:
51
+ for model_id in SUPPORTED_MODEL_NAMES:
52
+ if lang in model_id:
53
+ # create new lang in dict
54
+ if lang not in SUPPORTED_LANG_MODEL_DICT:
55
+ SUPPORTED_LANG_MODEL_DICT[lang] = [model_id]
56
+ else:
57
+ SUPPORTED_LANG_MODEL_DICT[lang].append(model_id)
58
+
59
+ # Sort model names
60
+ for lang in SUPPORTED_LANG_MODEL_DICT.keys():
61
+ model_ids = SUPPORTED_LANG_MODEL_DICT[lang]
62
+ model_ids = sorted(model_ids)
63
+ SUPPORTED_LANG_MODEL_DICT[lang] = model_ids
64
+
65
+
def transcribe(microphone, audio_file, model_name):
    """Transcribe recorded or uploaded audio with the selected hub model.

    Returns the transcription text, prefixed with a warning when both audio
    sources were supplied, or an error/warning message when no audio was
    given or the hosted model is unavailable.
    """
    model = model_dict[model_name]

    # Guard: nothing to transcribe.
    if microphone is None and audio_file is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if microphone is not None and audio_file is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence whenever it is present.
    audio_data = microphone if microphone is not None else audio_file

    try:
        # Use HF API for transcription
        transcriptions = model(audio_data)
    except Exception:
        # The hosted inference API raises while the model is still loading;
        # surface that to the user instead of crashing the UI.
        transcriptions = ""
        warn_output += "\n\n"
        warn_output += (
            f"The model `{model_name}` is currently loading and cannot be used "
            f"for transcription.\n"
            f"Please try another model or wait a few minutes."
        )

    return warn_output + transcriptions
100
+
# Build the Gradio Blocks UI: audio inputs, language/model selectors wired
# together, and a button that runs `transcribe`.
demo = gr.Blocks(title=TITLE, css=CSS)

with demo:
    gr.Markdown(MARKDOWN)

    with gr.Row():
        file_upload = gr.components.Audio(source="upload", type='filepath', label='Upload File')
        microphone = gr.components.Audio(source="microphone", type='filepath', label='Microphone')

    lang_selector = gr.components.Dropdown(
        choices=sorted(SUPPORTED_LANGUAGES),
        value="en",
        type="value",
        label="Languages",
        interactive=True,
    )
    models_in_lang = gr.components.Dropdown(
        choices=sorted(SUPPORTED_LANG_MODEL_DICT["en"]),
        value=DEFAULT_EN_MODEL,
        label="Models",
        interactive=True,
    )

    def update_models_with_lang(lang):
        # Refresh the model dropdown for the chosen language; English keeps
        # its curated default, every other language defaults to its first model.
        model_names = sorted(SUPPORTED_LANG_MODEL_DICT[lang])
        default = DEFAULT_EN_MODEL if lang == 'en' else model_names[0]
        return models_in_lang.update(choices=model_names, value=default)

    lang_selector.change(update_models_with_lang, inputs=[lang_selector], outputs=[models_in_lang])

    transcript = gr.components.Label(label='Transcript')

    run = gr.components.Button('Transcribe')
    run.click(transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript])

    gr.components.HTML(ARTICLE)

demo.queue(concurrency_count=1)
demo.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ libsndfile1
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ nemo_toolkit[asr]