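"""NeMo ASR inference demo: transcribe speech with NVIDIA NeMo models hosted on the Hugging Face Hub."""
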
import gradio as gr
import torch
import nemo.collections.asr as nemo_asr
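
# NeMo ASR models operate on 16 kHz mono audio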
SAMPLE_RATE = 16000
TITLE = "NeMo ASR Inference on Hugging Face"
DESCRIPTION = "Demo of all languages supported by NeMo ASR"
DEFAULT_EN_MODEL = "nvidia/stt_en_conformer_transducer_xlarge"
MARKDOWN = f"""
# {TITLE}
## {DESCRIPTION}
"""
CSS = """
p.big {
  font-size: 20px;
}
"""
ARTICLE = """
<br><br>
<p class='big' style='text-align: center'>
<a href='https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/intro.html' target='_blank'>NeMo ASR</a>
|
<a href='https://github.com/NVIDIA/NeMo#nvidia-nemo' target='_blank'>Github Repo</a>
</p>
"""
SUPPORTED_LANGUAGES = set()
SUPPORTED_MODEL_NAMES = set()
# Discover NeMo ASR checkpoints published on the Hugging Face Hub
hf_filter = nemo_asr.models.ASRModel.get_hf_model_filter()
hf_filter.task = "automatic-speech-recognition"
hf_infos = nemo_asr.models.ASRModel.search_huggingface_models(model_filter=hf_filter)
for info in hf_infos:
    lang_id = info.modelId.split("_")[1]  # language code, e.g. "en" from "nvidia/stt_en_..."
    SUPPORTED_LANGUAGES.add(lang_id)
    SUPPORTED_MODEL_NAMES.add(info.modelId)
SUPPORTED_MODEL_NAMES = sorted(SUPPORTED_MODEL_NAMES)
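
# Each model is loaded as a thin client of the Hugging Face Inference API, so
# inference runs on the Hub rather than downloading checkpoints into this Space.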
model_dict = {model_name: gr.Interface.load(f'models/{model_name}') for model_name in SUPPORTED_MODEL_NAMES}
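
# Group model ids by language, using the language code embedded in the model
# name (e.g. "en" in "nvidia/stt_en_conformer_transducer_xlarge").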
SUPPORTED_LANG_MODEL_DICT = {}
for lang in SUPPORTED_LANGUAGES:
    for model_id in SUPPORTED_MODEL_NAMES:
        if ("_" + lang + "_") in model_id:
            # create a new entry for this language on first match
            if lang not in SUPPORTED_LANG_MODEL_DICT:
                SUPPORTED_LANG_MODEL_DICT[lang] = [model_id]
            else:
                SUPPORTED_LANG_MODEL_DICT[lang].append(model_id)
# Sort model names within each language
for lang in SUPPORTED_LANG_MODEL_DICT.keys():
    model_ids = sorted(SUPPORTED_LANG_MODEL_DICT[lang])
    SUPPORTED_LANG_MODEL_DICT[lang] = model_ids
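
# Transcribe a single clip with the selected model. Microphone input takes
# precedence when both a recording and an uploaded file are provided.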
def transcribe(microphone, audio_file, model_name):
    model = model_dict[model_name]

    warn_output = ""
    if (microphone is not None) and (audio_file is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
        audio_data = microphone

    elif (microphone is None) and (audio_file is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    elif microphone is not None:
        audio_data = microphone
    else:
        audio_data = audio_file

    try:
        # Delegate transcription to the hosted HF Inference API
        transcriptions = model(audio_data)
    except Exception:
        transcriptions = ""
        warn_output += (
            "\n\n"
            f"The model `{model_name}` is currently loading and cannot be used "
            "for transcription.\n"
            "Please try another model or wait a few minutes."
        )

    return warn_output + transcriptions
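
# Assemble the UI: audio inputs, language/model selectors, and the transcript output.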
demo = gr.Blocks(title=TITLE, css=CSS)
with demo:
    header = gr.Markdown(MARKDOWN)

    with gr.Row() as row:
        file_upload = gr.components.Audio(source="upload", type='filepath', label='Upload File')
        microphone = gr.components.Audio(source="microphone", type='filepath', label='Microphone')
    lang_selector = gr.components.Dropdown(
        choices=sorted(list(SUPPORTED_LANGUAGES)), value="en", type="value", label="Languages", interactive=True,
    )
    models_in_lang = gr.components.Dropdown(
        choices=sorted(SUPPORTED_LANG_MODEL_DICT["en"]),
        value=DEFAULT_EN_MODEL,
        label="Models",
        interactive=True,
    )
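
    # Swap the model list whenever the selected language changes.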
    def update_models_with_lang(lang):
        models_names = sorted(SUPPORTED_LANG_MODEL_DICT[lang])
        default = models_names[0]
        if lang == 'en':
            default = DEFAULT_EN_MODEL
        return models_in_lang.update(choices=models_names, value=default)

    lang_selector.change(update_models_with_lang, inputs=[lang_selector], outputs=[models_in_lang])
    transcript = gr.components.Label(label='Transcript')

    run = gr.components.Button('Transcribe')
    run.click(transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript])

    gr.components.HTML(ARTICLE)
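
# Process one request at a time; each transcription blocks on a remote inference call.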
demo.queue(concurrency_count=1)
demo.launch(share=True)