!pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git
!pip install gradio -q

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install "matplotlib>=3.3.2"

## Install NeMo
BRANCH = 'r1.13.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

## Grab the config we'll use in this example
!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml

!python -m spacy download en_core_web_md
!python -m spacy link en_core_web_md en

import torch
import gradio as gr
from nemo.collections.asr.models import ASRModel
from gramformer import Gramformer

# Load the pretrained Citrinet ASR model and place it on the GPU when one is available
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
asr_model = ASRModel.from_pretrained(model_name='stt_en_citrinet_1024').to(device)

def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(1212)

gf = Gramformer(models=1, use_gpu=False)  # 1=corrector, 2=detector

def transcribe(audio):
    """Speech-to-text with NVIDIA NeMo, followed by grammar correction with Gramformer."""
    text = asr_model.transcribe(paths2audio_files=[audio])[0]
    corrected = list(gf.correct(text, max_candidates=1))[0]
    return text, corrected

# we need input, output and interface components for gradio
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.components.Audio(type="filepath"),
    ],
    outputs=["textbox", "textbox"],
).launch()
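If you want to sanity-check the pipeline without the web UI, you can call transcribe directly on a recording. The snippet below is a minimal sketch; the file name sample.wav is a hypothetical path, so replace it with any WAV file you have on disk.

# Optional sanity check: run the same function the Gradio app calls.
# "sample.wav" is a placeholder; point it at a real recording before running.
raw_text, fixed_text = transcribe("sample.wav")
print("ASR output:      ", raw_text)
print("Corrected output:", fixed_text)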