File size: 1,529 Bytes
0432ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
!pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git
!pip install gradio -q
## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install "matplotlib>=3.3.2"

## Install NeMo
BRANCH = 'r1.13.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

## Grab the config we'll use in this example
!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml

!python -m spacy download en_core_web_md
!python -m spacy link en_core_web_md en

import gradio as gr
import time
from nemo.collections.asr.models import ASRModel
import torch

# Select the compute device once. The original only assigned `device` inside
# the CUDA branch, leaving the name undefined on CPU-only machines (a latent
# NameError for any later use); it also used an f-string with no placeholders.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Pre-trained English Citrinet-1024 speech-to-text model
# (downloaded from NGC on first use).
asr_model = ASRModel.from_pretrained(model_name='stt_en_citrinet_1024')

from gramformer import Gramformer
import torch

def set_seed(seed):
    """Seed torch's RNGs (CPU and, when present, every CUDA device) for reproducibility."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Fix RNG state so Gramformer's candidate generation is repeatable across runs.
set_seed(1212)

# Grammar-correction model; models=1 selects the corrector (2 would be the
# detector). use_gpu=False keeps this model on CPU.
gf = Gramformer(models = 1, use_gpu=False) # 1=corrector, 2=detector

def transcribe(audio):
  """Transcribe an audio file with NeMo ASR and grammar-correct the result.

  Args:
    audio: Path to an audio file (Gradio passes a temp-file path).

  Returns:
    Tuple of (raw ASR transcript, grammar-corrected transcript).
  """
  text = asr_model.transcribe(paths2audio_files=[audio])[0]
  # gf.correct yields candidate corrections. The original did
  # list(...)[0], which raises IndexError when no candidate is produced;
  # fall back to the raw transcript instead.
  correct = next(iter(gf.correct(text, max_candidates=1)), text)
  return text, correct

# we need input, output and interface components for gradio
# One audio input delivered to `transcribe` as a file path; two text outputs:
# the raw transcript and the grammar-corrected transcript. launch() starts
# the web UI and blocks.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.components.Audio(type="filepath"),
    ],
    outputs=[
        "textbox",
        "textbox"
    ]).launch()