import gradio as gr
import librosa
import numpy as np
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid

# NeMo Conformer ASR models expect 16 kHz mono audio
SAMPLE_RATE = 16000

# Load the pretrained Conformer Transducer (RNNT) model; passing None to
# change_decoding_strategy() resets decoding to the model's default config
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_en_conformer_transducer_xlarge")
model.change_decoding_strategy(None)
model.eval()
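
# A hedged sketch of switching to beam-search decoding instead of the default;
# the field names below assume the NeMo 1.x RNNT decoding config and may
# differ across NeMo versions:
#
#   decoding_cfg = model.cfg.decoding
#   decoding_cfg.strategy = "beam"
#   decoding_cfg.beam.beam_size = 4
#   model.change_decoding_strategy(decoding_cfg)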


def process_audio_file(data, sr):
    # Gradio returns stereo audio as (samples, channels), while librosa
    # expects (channels, samples), so transpose before downmixing to mono
    if data.ndim > 1:
        data = data.T
    data = librosa.to_mono(data)

    # Resample to the model's expected sample rate if needed
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    return data


def transcribe(state, audio):
    sr, audio = audio
    audio = audio.astype(np.float32)

    # Normalize to [-1, 1]; guard against all-zero (silent) chunks
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio /= peak

    if state is None:
        state = ""

    audio_data = process_audio_file(audio, sr)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Write the chunk to a temporary WAV file and transcribe by filepath
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        transcriptions = model.transcribe([audio_path])

        # RNNT models return a (best_hypotheses, all_hypotheses) tuple;
        # keep only the best hypotheses
        if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
            transcriptions = transcriptions[0]

        # One file in the batch -> one transcription
        transcriptions = transcriptions[0]

    state = state + transcriptions + " "
    return state, state
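
# A minimal, hypothetical offline check of the transcribe() pipeline that
# bypasses the Gradio microphone input -- assumes a local WAV file "test.wav":
#
#   data, sr = librosa.load("test.wav", sr=None)
#   _, text = transcribe(None, (sr, data))
#   print(text)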


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        "state",
        gr.Audio(source="microphone", streaming=True),
    ],
    outputs=[
        "state",
        "textbox",
    ],
    title="NeMo Streaming Conformer Transducer Large - English",
    description="Demo for English speech recognition using Conformer Transducers",
    live=True,
)
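
# With live=True and a streaming Audio input, Gradio repeatedly calls
# transcribe() on short microphone chunks; the "state" input/output pair
# carries the accumulated transcript across those calls.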

# hack to prevent flickering of output
# iface.dependencies[0]["show_progress"] = False
# iface.dependencies[1]["show_progress"] = False
# iface.dependencies[2]["show_progress"] = False

iface.launch()
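
# When running locally rather than on Hugging Face Spaces, a shareable public
# URL can be requested with iface.launch(share=True).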