smajumdar committed on
Commit
7a5ebea
1 Parent(s): 6119c80

Initial commit

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. app.py +64 -0
  3. packages.txt +2 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Nemo_conformer_rnnt_large_streaming
3
  emoji: 🐠
4
- colorFrom: pink
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 2.9.0
8
  app_file: app.py
 
1
  ---
2
  title: Nemo_conformer_rnnt_large_streaming
3
  emoji: 🐠
4
+ colorFrom: blue
5
+ colorTo: white
6
  sdk: gradio
7
  sdk_version: 2.9.0
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import librosa
4
+ import soundfile
5
+ import nemo.collections.asr as nemo_asr
6
+ import tempfile
7
+ import os
8
+ import uuid
9
+
10
# Sample rate (Hz) that audio is converted to before it is written to disk
# and handed to the model.
SAMPLE_RATE = 16000

# Load the pretrained Conformer transducer once at import time so that every
# request served by this module reuses the same model instance.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "stt_en_conformer_transducer_large"
)
# NOTE(review): passing None presumably selects the model's default decoding
# configuration - confirm against the NeMo documentation.
model.change_decoding_strategy(None)
# Put the model in evaluation (inference) mode.
model.eval()
15
+
16
+
17
def process_audio_file(file):
    """Load an audio file as a mono waveform sampled at ``SAMPLE_RATE``.

    Args:
        file: Path (or file-like object) accepted by ``librosa.load``.

    Returns:
        The decoded audio samples, down-mixed to a single channel and
        resampled to ``SAMPLE_RATE``.
    """
    # Ask librosa for the target rate and mono output directly. Calling
    # ``librosa.load`` with its defaults first resamples to librosa's own
    # default rate, which forces a second resampling pass afterwards, and the
    # positional ``librosa.resample(data, sr, target)`` form is rejected by
    # librosa releases that made those arguments keyword-only.
    data, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)
    return data
26
+
27
+
28
def transcribe(Audio, state=""):
    """Transcribe one recording and append the text to the running transcript.

    Args:
        Audio: Path to the recorded audio file, or ``None`` when no recording
            is available.
        state: Transcript accumulated by earlier calls.

    Returns:
        A ``(text, state)`` tuple whose two items are the same updated
        transcript: one for display and one to be passed back in as ``state``
        on the next call.
    """
    # Treat a missing state as an empty transcript so concatenation is safe.
    state = state or ""

    # Nothing was recorded: leave the transcript untouched instead of
    # failing while trying to load a non-existent file.
    if Audio is None:
        return state, state

    audio_data = process_audio_file(Audio)

    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        # The model reads the file from disk, so this call has to happen
        # before the temporary directory is removed.
        transcriptions = model.transcribe([audio_path])

    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    # No hypothesis came back for the single input file: keep the transcript
    # unchanged.
    if not transcriptions:
        return state, state

    state = state + transcriptions[0] + " "
    return state, state
45
+
46
+
47
# Build the demo UI around `transcribe`. The "state" entry appears in both
# `inputs` and `outputs`, matching the function's second parameter and second
# return value, so the accumulated transcript is carried from call to call.
iface = gr.Interface(
    fn=transcribe,
    inputs=[gr.inputs.Audio(source="microphone", type="filepath"), "state"],
    outputs=["textbox", "state"],
    title="NeMo Streaming Conformer Transducer Large - English",
    description="Demo for English speech recognition using Conformer Transducers",
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
    live=True,
)

# Start serving the interface with request queueing enabled.
iface.launch(enable_queue=True)
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ libsndfile1
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ nemo_toolkit[asr]