"""Gradio demo for NVIDIA NeMo ASR models hosted on the Hugging Face Hub.

The app discovers every NeMo ASR checkpoint published on the Hub, groups the
models by language code, and lets the user transcribe microphone input or an
uploaded audio file with the selected model via the HF inference API.
"""

import gradio as gr
import torch
import nemo.collections.asr as nemo_asr

SAMPLE_RATE = 16000  # Hz; expected input rate for the NeMo ASR models

TITLE = "NeMo ASR Inference on Hugging Face"
DESCRIPTION = "Demo of all languages supported by NeMo ASR"

DEFAULT_EN_MODEL = "nvidia/stt_en_conformer_transducer_xlarge"

MARKDOWN = f"""
# {TITLE}

## {DESCRIPTION}
"""

CSS = """
p.big {
  font-size: 20px;
}
"""

ARTICLE = """

NeMo ASR | Github Repo

"""

# Discover all NeMo ASR checkpoints published on the Hugging Face Hub.
hf_filter = nemo_asr.models.ASRModel.get_hf_model_filter()
hf_filter.task = "automatic-speech-recognition"
hf_infos = nemo_asr.models.ASRModel.search_huggingface_models(model_filter=hf_filter)

# Model ids follow the pattern "<org>/stt_<lang>_<arch>...", so the language
# code is the second "_"-separated token.
SUPPORTED_LANGUAGES = {info.modelId.split("_")[1] for info in hf_infos}
SUPPORTED_MODEL_NAMES = sorted({info.modelId for info in hf_infos})

# One HF inference interface per model; inference itself runs remotely.
model_dict = {model_name: gr.Interface.load(f'models/{model_name}') for model_name in SUPPORTED_MODEL_NAMES}

# Group model ids by language code. A model id containing "_<lang>_" belongs
# to that language's bucket; buckets stay sorted for stable dropdown order.
SUPPORTED_LANG_MODEL_DICT = {}
for model_id in SUPPORTED_MODEL_NAMES:
    for lang in SUPPORTED_LANGUAGES:
        if f"_{lang}_" in model_id:
            SUPPORTED_LANG_MODEL_DICT.setdefault(lang, []).append(model_id)

for lang, model_ids in SUPPORTED_LANG_MODEL_DICT.items():
    SUPPORTED_LANG_MODEL_DICT[lang] = sorted(model_ids)


def transcribe(microphone, audio_file, model_name):
    """Transcribe audio with the selected Hub-hosted model.

    Args:
        microphone: Filepath of the recorded clip, or ``None``.
        audio_file: Filepath of the uploaded clip, or ``None``.
        model_name: Key into ``model_dict`` selecting the ASR model.

    Returns:
        The transcription text, prefixed with any warning. Microphone input
        takes precedence when both sources are provided.
    """
    model = model_dict[model_name]

    warn_output = ""
    if (microphone is not None) and (audio_file is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
        audio_data = microphone
    elif (microphone is None) and (audio_file is None):
        return "ERROR: You have to either use the microphone or upload an audio file"
    elif microphone is not None:
        audio_data = microphone
    else:
        audio_data = audio_file

    try:
        # Use HF API for transcription (runs remotely; may raise while the
        # hosted model is still spinning up).
        transcriptions = model(audio_data)
    except Exception:
        # Deliberate best-effort: surface a friendly message in the UI
        # instead of crashing the request.
        transcriptions = ""
        warn_output += "\n\n"
        warn_output += (
            f"The model `{model_name}` is currently loading and cannot be used "
            f"for transcription.\n"
            f"Please try another model or wait a few minutes."
        )

    return warn_output + transcriptions


demo = gr.Blocks(title=TITLE, css=CSS)

with demo:
    header = gr.Markdown(MARKDOWN)

    with gr.Row() as row:
        file_upload = gr.components.Audio(source="upload", type='filepath', label='Upload File')
        microphone = gr.components.Audio(source="microphone", type='filepath', label='Microphone')

    lang_selector = gr.components.Dropdown(
        choices=sorted(SUPPORTED_LANGUAGES),
        value="en",
        type="value",
        label="Languages",
        interactive=True,
    )
    # Buckets were sorted above, so no extra sort is needed here.
    models_in_lang = gr.components.Dropdown(
        choices=SUPPORTED_LANG_MODEL_DICT["en"],
        value=DEFAULT_EN_MODEL,
        label="Models",
        interactive=True,
    )

    def update_models_with_lang(lang):
        """Refresh the model dropdown when the language selection changes."""
        models_names = SUPPORTED_LANG_MODEL_DICT[lang]
        # English keeps its curated default; other languages fall back to
        # the alphabetically-first model.
        default = DEFAULT_EN_MODEL if lang == 'en' else models_names[0]
        return models_in_lang.update(choices=models_names, value=default)

    lang_selector.change(update_models_with_lang, inputs=[lang_selector], outputs=[models_in_lang])

    transcript = gr.components.Label(label='Transcript')

    run = gr.components.Button('Transcribe')
    run.click(transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript])

    gr.components.HTML(ARTICLE)

demo.queue(concurrency_count=1)
demo.launch()