Spaces:

smajumdar
/

nemo_conformer_rnnt_large_streaming

Runtime error

App Files Files Community

nemo_conformer_rnnt_large_streaming / app.py

smajumdar

Update app.py

e69d5a9 11 months ago

raw

history blame contribute delete

No virus

2.15 kB

	import gradio as gr
	import torch
	import time
	import librosa
	import numpy as np
	import soundfile
	import nemo.collections.asr as nemo_asr
	import tempfile
	import os
	import uuid

	# Sampling rate (Hz) that audio is resampled to and written to disk at before
	# it is handed to the model.
	SAMPLE_RATE = 16000

	# Load the pretrained RNNT BPE checkpoint once, at import time; transcribe()
	# uses this module-level object for every request.
	# NOTE(review): this is the "xlarge" checkpoint, while the UI title below says
	# "Large" - confirm which one is intended.
	model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_en_conformer_transducer_xlarge")
	# NOTE(review): passing None presumably resets decoding to the model's default
	# configuration - confirm against the NeMo documentation.
	model.change_decoding_strategy(None)
	# Put the model in evaluation mode before serving requests.
	model.eval()


	def process_audio_file(data, sr):
	    """Bring raw samples to SAMPLE_RATE and collapse them to a single channel.

	    Args:
	        data: Audio samples; handed to librosa as-is, so librosa's array
	            layout conventions apply.
	        sr: Sampling rate of ``data`` in Hz.

	    Returns:
	        The output of ``librosa.to_mono`` applied to the samples, which are
	        resampled first only when ``sr`` differs from ``SAMPLE_RATE``.
	    """
	    rate_matches = sr == SAMPLE_RATE
	    at_target_rate = (
	        data
	        if rate_matches
	        else librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
	    )

	    # Down-mix after resampling, same order as before.
	    return librosa.to_mono(at_target_rate)


	def transcribe(state, audio):
	    """Transcribe one streamed audio chunk and append it to the transcript.

	    Args:
	        state: Transcript accumulated by previous calls, or ``None`` on the
	            first call.
	        audio: ``(sample_rate, samples)`` tuple for the current chunk, or
	            ``None`` when there is no audio to process.

	    Returns:
	        ``(state, state)`` - the updated transcript twice, once for the
	        "state" output and once for the textbox output.
	    """
	    if state is None:
	        state = ""

	    # No chunk delivered: leave the transcript untouched instead of crashing
	    # on the tuple unpacking below.
	    if audio is None:
	        return state, state

	    sr, audio = audio
	    audio = audio.astype(np.float32)

	    # An empty chunk has no peak to normalise by (np.max would raise).
	    if audio.size == 0:
	        return state, state

	    # Peak-normalise, but skip all-zero (silent) chunks: dividing by a zero
	    # peak would fill the buffer with NaNs.
	    peak = np.max(np.abs(audio))
	    if peak > 0:
	        audio /= peak

	    # Gradio documents multi-channel numpy audio as (samples, channels),
	    # whereas librosa works on (channels, samples); transpose so resampling
	    # and the mono down-mix act on the right axes.
	    if audio.ndim > 1:
	        audio = audio.T

	    audio_data = process_audio_file(audio, sr)

	    # The model is fed a file path, so the chunk is written to a throwaway
	    # WAV file that is removed together with the temporary directory.
	    with tempfile.TemporaryDirectory() as tmpdir:
	        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
	        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
	        transcriptions = model.transcribe([audio_path])

	    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
	    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
	        transcriptions = transcriptions[0]

	    best = transcriptions[0]
	    # NOTE(review): some NeMo releases appear to return hypothesis objects
	    # exposing ``.text`` rather than plain strings - confirm. Plain strings
	    # pass through unchanged.
	    text = getattr(best, "text", best)

	    state = state + text + " "
	    return state, state


	# Live demo UI: each streamed microphone chunk goes through transcribe().
	# The "state" input/output pair carries the running transcript between
	# calls; the textbox shows it.
	iface = gr.Interface(
	    title="NeMo Streaming Conformer Transducer Large - English",
	    description="Demo for English speech recognition using Conformer Transducers",
	    fn=transcribe,
	    inputs=["state", gr.Audio(source="microphone", streaming=True)],
	    outputs=["state", "textbox"],
	    live=True,
	)

	# hack to prevent flickering of output
	# (currently disabled; when enabled, it sets "show_progress" to False on the
	# first three entries of iface.dependencies)
	# iface.dependencies[0]["show_progress"] = False
	# iface.dependencies[1]["show_progress"] = False
	# iface.dependencies[2]["show_progress"] = False

	# Launch the interface; this runs at module top level.
	iface.launch()