# Source-page metadata: app.py, commit e69d5a9 "Update app.py" (smajumdar), 2.15 kB.
import gradio as gr
import torch
import time
import librosa
import numpy as np
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid
# Sampling rate used both as the resampling target and when writing the temporary WAV file.
SAMPLE_RATE = 16000
# Load the pretrained checkpoint once at import time; every transcribe() call reuses this instance.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_en_conformer_transducer_xlarge")
# NOTE(review): passing None presumably selects the model's default decoding configuration — confirm against the NeMo docs.
model.change_decoding_strategy(None)
# NOTE(review): presumably switches the model to inference mode — confirm.
model.eval()
def process_audio_file(data, sr):
    """Bring audio samples to ``SAMPLE_RATE`` and collapse them to one channel.

    Args:
        data: Audio samples, in the array layout the ``librosa`` helpers
            called below accept.
        sr: Sampling rate of ``data``.

    Returns:
        The result of ``librosa.to_mono`` applied to the samples, which are
        resampled first whenever ``sr`` differs from ``SAMPLE_RATE``.
    """
    already_at_target_rate = sr == SAMPLE_RATE
    if not already_at_target_rate:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
    # Downmix last so the caller always receives a single channel.
    return librosa.to_mono(data)
def transcribe(state, audio):
    """Transcribe one streamed audio chunk and append it to the transcript.

    Args:
        state: Transcript accumulated so far, or ``None`` on the first call.
        audio: ``(sample_rate, samples)`` pair from the audio input, or
            ``None`` when no audio was delivered for this update.

    Returns:
        A ``(state, text)`` pair containing the updated transcript twice:
        once for the session state output and once for the textbox output.
        The transcript is returned unchanged when there is no audio to
        process.
    """
    if state is None:
        state = ""

    # No audio for this update: keep the transcript as it is instead of
    # failing on the tuple unpacking below.
    if audio is None:
        return state, state

    sr, samples = audio
    # astype() copies, so the in-place normalization below never mutates the
    # caller's buffer.
    samples = np.asarray(samples).astype(np.float32)
    if samples.size == 0:
        # np.max() raises on empty input; there is nothing to transcribe.
        return state, state

    # NOTE(review): assumes multi-channel chunks arrive as (samples, channels),
    # while the librosa helpers expect (channels, samples) — confirm against
    # the Gradio Audio docs. Downmixing here sidesteps the layout mismatch.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Peak-normalize, but skip all-silent chunks: dividing by a zero peak
    # would fill the signal with NaNs.
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples /= peak

    audio_data = process_audio_file(samples, sr)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Filepath transcribe: write the chunk to a uniquely named WAV file
        # and pass its path to the model.
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        transcriptions = model.transcribe([audio_path])

    # If transcriptions form a tuple (from RNNT), keep just the "best" hypotheses.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    best = transcriptions[0]
    if not isinstance(best, str):
        # NOTE(review): newer NeMo releases appear to return hypothesis
        # objects exposing `.text` rather than plain strings — confirm.
        best = getattr(best, "text", best)

    state = state + best + " "
    return state, state
# Live demo UI: every streamed microphone chunk is passed to `transcribe`
# together with the session state, and the returned transcript is both stored
# back into the state and shown in a textbox.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        "state",
        gr.Audio(source="microphone", streaming=True),
    ],
    outputs=[
        "state",
        "textbox",
    ],
    # The title names the XLarge variant to match the pretrained checkpoint
    # this app loads ("nvidia/stt_en_conformer_transducer_xlarge").
    title="NeMo Streaming Conformer Transducer XLarge - English",
    description="Demo for English speech recognition using Conformer Transducers",
    live=True,
)

# Disabled hack to prevent flickering of the output; kept for reference.
# iface.dependencies[0]["show_progress"] = False
# iface.dependencies[1]["show_progress"] = False
# iface.dependencies[2]["show_progress"] = False

iface.launch()