Speech-Analyser / app.py
spookyspaghetti's picture
Create app.py
0432ec8
raw
history blame
1.53 kB
!pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git
!pip install gradio -q
## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2
## Install NeMo
BRANCH = 'r1.13.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
## Grab the config we'll use in this example
!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml
!python -m spacy download en_core_web_md
!python -m spacy link en_core_web_md en
import gradio as gr
import time
from nemo.collections.asr.models import ASRModel
import torch
# Select the compute device once: the first CUDA GPU when one is available,
# otherwise the CPU, so that `device` is always defined.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Speech-to-text model, loaded from the pretrained checkpoint named
# 'stt_en_citrinet_1024'.
asr_model = ASRModel.from_pretrained(
    model_name='stt_en_citrinet_1024',
)
from gramformer import Gramformer
import torch
def set_seed(seed):
    """Seed PyTorch's random number generators.

    Args:
        seed: Value passed to ``torch.manual_seed`` and, when CUDA is
            available, to ``torch.cuda.manual_seed_all``.
    """
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# Seed the random number generators before building the corrector.
set_seed(1212)

# Gramformer instance used by `transcribe`, created with GPU use disabled.
# `models` selects the component: 1 = corrector, 2 = detector.
gf = Gramformer(
    models=1,
    use_gpu=False,
)
def transcribe(audio):
    """Speech to text using Nvidia Nemo, followed by grammar correction.

    Args:
        audio: Path to the audio file to transcribe. May be ``None`` or
            empty (e.g. when nothing was recorded or uploaded).

    Returns:
        A ``(text, correct)`` tuple holding the raw transcript and its
        corrected version. Both are empty strings when no audio was given;
        ``correct`` equals ``text`` when the transcript is blank or the
        corrector yields no candidate.
    """
    # Nothing to transcribe: return empty outputs instead of passing an
    # invalid path to the ASR model.
    if not audio:
        return "", ""

    text = asr_model.transcribe(paths2audio_files=[audio])[0]

    # A blank transcript gives the corrector nothing meaningful to work on.
    if not text.strip():
        return text, text

    candidates = gf.correct(text, max_candidates=1)
    # Take the single requested candidate; fall back to the raw transcript
    # rather than raising IndexError if none was produced.
    correct = next(iter(candidates), text)
    return text, correct
# Gradio UI: a single audio input handed to `transcribe` as a file path, and
# two text boxes that receive the two values `transcribe` returns.
gr.Interface(
    fn=transcribe,
    inputs=[gr.components.Audio(type="filepath")],
    outputs=["textbox", "textbox"],
).launch()