Spaces:
Runtime error
Runtime error
File size: 2,147 Bytes
7a5ebea b2df314 7a5ebea 737d6ed 7a5ebea e0026de 7a5ebea 737d6ed 7a5ebea 1fb0d02 7a5ebea dba7e08 7c44de0 0d76512 7c44de0 737d6ed e69d5a9 b2df314 737d6ed 7a5ebea e954933 7e9b470 e954933 7e9b470 7a5ebea e69d5a9 7a5ebea 737d6ed 7a5ebea dba7e08 7a5ebea b26d31a 6f41a53 b26d31a b2df314 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import gradio as gr
import torch
import time
import librosa
import numpy as np
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid
# Sample rate (Hz) that audio is resampled to and written out at before transcription.
SAMPLE_RATE = 16000
# Load the pretrained RNNT BPE checkpoint by name; this happens once, at import time.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_en_conformer_transducer_xlarge")
# NOTE(review): passing None presumably re-applies the model's default decoding config — confirm against the NeMo docs.
model.change_decoding_strategy(None)
# Put the model in evaluation mode; this script only runs inference.
model.eval()
# def process_audio_file(file):
def process_audio_file(data, sr):
    """Bring a waveform to the model's sample rate and collapse it to mono.

    Args:
        data: Audio samples as a NumPy array, handed directly to librosa.
            NOTE(review): librosa treats the last axis as time, so this
            presumably expects ``(n,)`` or ``(channels, n)`` — confirm
            against callers.
        sr: Sample rate of ``data``.

    Returns:
        The waveform, resampled to ``SAMPLE_RATE`` only when ``sr`` differs
        from it, then passed through ``librosa.to_mono``.
    """
    waveform = (
        librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
        if sr != SAMPLE_RATE
        else data
    )
    # Collapse any channel dimension down to a single channel.
    return librosa.to_mono(waveform)
def transcribe(state, audio):
    """Transcribe one streamed audio chunk and append it to the transcript.

    Args:
        state: Transcript accumulated so far, or ``None`` on the first call.
        audio: ``(sample_rate, samples)`` tuple from the Gradio audio input,
            or ``None`` when the callback fires without a chunk. ``samples``
            is expected in Gradio's layout: ``(n,)`` or ``(n, channels)``.

    Returns:
        ``(state, state)``: the updated transcript twice, once for the
        session-state output and once for the textbox. The transcript is
        returned unchanged when the chunk is missing, empty, silent, or
        contains non-finite samples.
    """
    if state is None:
        state = ""

    # Streaming callbacks can fire with no payload; there is nothing to do.
    if audio is None:
        return state, state

    sr, samples = audio
    # astype() always copies, so the in-place scaling below never touches
    # the caller's buffer.
    samples = np.asarray(samples).astype(np.float32)
    if samples.size == 0:
        return state, state

    # Gradio delivers multi-channel audio samples-first, while librosa's
    # resample/to_mono treat the last axis as time. Downmix here so the
    # downstream helper always receives a 1-D waveform.
    if samples.ndim == 2:
        samples = samples.mean(axis=1)

    # Peak-normalise. A zero peak (pure silence) would divide by zero and
    # fill the chunk with NaNs, so such chunks are skipped instead.
    peak = np.abs(samples).max()
    if not np.isfinite(peak) or peak == 0:
        return state, state
    samples /= peak

    audio_data = process_audio_file(samples, sr)

    # The model is driven through a file path, so stage the chunk as a WAV
    # in a directory that is removed as soon as transcription finishes.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, f"audio_{uuid.uuid4()}.wav")
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        transcriptions = model.transcribe([audio_path])

    # RNNT models may return (best_hypotheses, all_hypotheses); keep the best.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]
    best = transcriptions[0]
    # NOTE(review): newer NeMo releases appear to return Hypothesis objects
    # exposing ``.text`` instead of plain strings — confirm for the pinned
    # NeMo version.
    text = best if isinstance(best, str) else best.text

    state = state + text + " "
    return state, state
# Streaming microphone capture; each chunk is passed to `transcribe`
# together with the per-session state.
microphone = gr.Audio(source="microphone", streaming=True)

# The "state" input/output pair carries the running transcript between
# calls; the textbox displays it.
iface = gr.Interface(
    fn=transcribe,
    inputs=["state", microphone],
    outputs=["state", "textbox"],
    title="NeMo Streaming Conformer Transducer Large - English",
    description="Demo for English speech recognition using Conformer Transducers",
    live=True,
)

# hack to prevent flickering of output
# iface.dependencies[0]["show_progress"] = False
# iface.dependencies[1]["show_progress"] = False
# iface.dependencies[2]["show_progress"] = False

iface.launch()
|