"""Gradio demo: streaming English ASR with an NVIDIA NeMo Conformer Transducer.

Microphone audio is streamed in chunks; each chunk is normalized, converted
to mono 16 kHz, written to a temporary wav, transcribed, and appended to a
running transcript kept in Gradio session state.
"""
import gradio as gr
import torch
import time
import librosa
import numpy as np
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid

# The pretrained model expects 16 kHz mono input.
SAMPLE_RATE = 16000

model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "nvidia/stt_en_conformer_transducer_xlarge"
)
# Passing None resets decoding to the model's default (greedy) configuration.
model.change_decoding_strategy(None)
model.eval()


def process_audio_file(data, sr):
    """Convert raw samples to the mono 16 kHz float stream the model expects.

    Parameters
    ----------
    data : np.ndarray
        Audio samples; 1-D mono, or 2-D ``(samples, channels)`` as delivered
        by Gradio's numpy audio component.
    sr : int
        Sample rate of ``data``.

    Returns
    -------
    np.ndarray
        1-D mono audio at ``SAMPLE_RATE``.
    """
    # Gradio delivers multichannel audio as (samples, channels); librosa
    # operates on the last axis, i.e. expects (channels, samples) — so
    # transpose before downmixing.  Mono 1-D input is unaffected.
    if data.ndim > 1:
        data = data.T
    # Downmix first, then resample the single channel (less work, same result).
    data = librosa.to_mono(data)
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
    return data


def transcribe(state, audio):
    """Transcribe one streamed chunk and append it to the running transcript.

    Parameters
    ----------
    state : str or None
        Accumulated transcript so far (``None`` on the first call).
    audio : tuple[int, np.ndarray]
        ``(sample_rate, samples)`` pair from the Gradio Audio component.

    Returns
    -------
    tuple[str, str]
        The updated transcript, echoed twice (state output and textbox).
    """
    sr, audio = audio
    audio = audio.astype(np.float32)
    # Peak-normalize; guard against division by zero on a silent chunk,
    # which would otherwise fill the buffer with NaN/inf.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio /= peak

    if state is None:
        state = ""

    audio_data = process_audio_file(audio, sr)

    with tempfile.TemporaryDirectory() as tmpdir:
        # NeMo's transcribe() API takes file paths, so round-trip via a wav.
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        transcriptions = model.transcribe([audio_path])

    # RNNT models return (best_hypotheses, all_hypotheses); keep the best.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]
    transcriptions = transcriptions[0]

    state = state + transcriptions + " "
    return state, state


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        "state",
        gr.Audio(source="microphone", streaming=True),
    ],
    outputs=[
        "state",
        "textbox",
    ],
    title="NeMo Streaming Conformer Transducer Large - English",
    description="Demo for English speech recognition using Conformer Transducers",
    live=True,
)

iface.launch()