awacke1 committed on
Commit
bfb646b
1 Parent(s): a2b8b74

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +74 -0
  2. packages.txt +2 -0
  3. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import time
4
+ import librosa
5
+ import soundfile
6
+ import nemo.collections.asr as nemo_asr
7
+ import tempfile
8
+ import os
9
+ import uuid
10
+
11
# Sample rate (Hz) that audio is resampled to and written at before transcription.
SAMPLE_RATE = 16000

# Name of the pretrained NeMo checkpoint to load.
_PRETRAINED_MODEL_NAME = "nvidia/stt_en_conformer_transducer_xlarge"

# Load the pretrained RNNT BPE model once at import time so every request reuses it.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(_PRETRAINED_MODEL_NAME)
# NOTE(review): passing None presumably selects the model's default decoding
# configuration -- confirm against the NeMo documentation.
model.change_decoding_strategy(None)
# Switch to evaluation mode; the model is only used for inference in this script.
model.eval()
16
+
17
+
18
def process_audio_file(file):
    """Load an audio file as a mono waveform sampled at ``SAMPLE_RATE``.

    Args:
        file: Path (or any other source accepted by ``librosa.load``) of the
            audio to decode.

    Returns:
        The decoded samples, down-mixed to a single channel and resampled to
        ``SAMPLE_RATE``.
    """
    # Request the target rate up front. Without an explicit ``sr`` librosa
    # first resamples to its own 22050 Hz default, which meant every clip was
    # resampled twice (native -> 22050 -> SAMPLE_RATE). ``mono=True`` performs
    # the down-mix that the separate ``to_mono`` call used to do.
    data, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)
    return data
27
+
28
+
29
def transcribe(audio, state=""):
    """Transcribe one audio clip and append the text to the running transcript.

    Args:
        audio: Path of the audio file to transcribe. ``None`` is treated as
            "no new audio" and leaves the transcript unchanged.
        state: Transcript accumulated by previous calls. ``None`` is treated
            as an empty transcript.

    Returns:
        A ``(text, state)`` pair containing the same updated transcript
        twice: once for display and once to carry into the next call.
    """
    if state is None:
        state = ""

    # Nothing was recorded: return the transcript untouched rather than
    # failing inside the audio loader.
    if audio is None:
        return state, state

    audio_data = process_audio_file(audio)

    # The model is given a file path, so write the normalised audio to a
    # uniquely named WAV file that is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        transcriptions = model.transcribe([audio_path])

    # If the result is a 2-tuple (as RNNT models can return), keep only the
    # first element, which holds the "best" hypotheses.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    # Exactly one file was submitted, so take its single result.
    text = transcriptions[0]

    state = state + text + " "
    return state, state
55
+
56
+
57
# Inputs: streamed microphone audio (handed to the callback as a file path)
# plus the session state carrying the transcript so far.
_interface_inputs = [
    gr.Audio(source="microphone", type='filepath', streaming=True),
    "state",
]

# Outputs: the visible transcript textbox and the updated session state.
_interface_outputs = ["textbox", "state"]

# NOTE(review): the title says "Large" while the checkpoint loaded in this
# script is named "...transducer_xlarge" -- confirm which wording is intended.
iface = gr.Interface(
    fn=transcribe,
    inputs=_interface_inputs,
    outputs=_interface_outputs,
    layout="horizontal",
    theme="huggingface",
    title="NeMo Streaming Conformer Transducer Large - English",
    description="Demo for English speech recognition using Conformer Transducers",
    allow_flagging='never',
    live=True,
)

# Start serving the demo.
iface.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ libsndfile1
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ nemo_toolkit[asr]