smajumdar committed on
Commit
75e1446
•
1 Parent(s): 729499b

Add nemo inference code

Browse files
Files changed (4) hide show
  1. README.md +1 -2
  2. app.py +71 -0
  3. packages.txt +2 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,10 +1,9 @@
1
  ---
2
  title: Nemo_conformer_rnnt_large
3
- emoji: 📉
4
  colorFrom: green
5
  colorTo: red
6
  sdk: gradio
7
- sdk_version: 2.8.14
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
1
  ---
2
  title: Nemo_conformer_rnnt_large
3
+ emoji: ๐Ÿ 
4
  colorFrom: green
5
  colorTo: red
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: false
9
  license: apache-2.0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import librosa
4
+ import soundfile
5
+ import nemo.collections.asr as nemo_asr
6
+ import tempfile
7
+ import os
8
+ import uuid
9
+
10
# Sample rate (Hz) that audio is converted to before it is written to disk and
# handed to the model for transcription.
SAMPLE_RATE = 16000

# Fetch the pretrained checkpoint by name and build the RNNT BPE model from it.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("stt_en_conformer_transducer_large")
# NOTE(review): passing None presumably keeps the decoding configuration that
# ships with the checkpoint -- confirm against the NeMo documentation.
model.change_decoding_strategy(None)
# The model is only used for inference in this script, so put it in eval mode.
model.eval()
15
+
16
+
17
def process_audio_file(file):
    """Load an audio file as a mono waveform sampled at SAMPLE_RATE.

    Args:
        file: Path to the audio file to decode.

    Returns:
        The decoded samples as returned by ``librosa.load``: a single-channel
        array already resampled to ``SAMPLE_RATE``.
    """
    # Ask librosa for the target rate and mono mix-down directly.  Calling
    # librosa.load() without `sr` first resamples everything to librosa's
    # 22050 Hz default, which forced a second resampling pass down to
    # SAMPLE_RATE.  That second pass also used positional rate arguments to
    # librosa.resample(), which newer librosa releases reject (keyword-only).
    data, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)
    return data
26
+
27
+
28
def transcribe(file_mic, file_upload):
    """Transcribe speech from a microphone recording or an uploaded file.

    Args:
        file_mic: Path to the microphone recording, or None if none was made.
        file_upload: Path to the uploaded audio file, or None if none was given.

    Returns:
        The transcription text.  If both inputs are supplied, the microphone
        recording is transcribed and the text is prefixed with a warning.  If
        neither is supplied, an error message string is returned instead.
    """
    if file_mic is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if file_mic is not None and file_upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the upload.
    file = file_mic if file_mic is not None else file_upload
    audio_data = process_audio_file(file)

    # The model transcribes from file paths, so the normalized audio is
    # written to a throwaway WAV that is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)

        transcriptions = model.transcribe([audio_path])

    # If transcriptions form a tuple (from RNNT), keep just the "best" hypotheses.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    best = transcriptions[0]
    # NOTE(review): the NeMo dependency is unpinned and some releases appear to
    # return hypothesis objects (with a `.text` attribute) rather than plain
    # strings -- confirm against the installed version.  Plain strings pass
    # through unchanged.
    if not isinstance(best, str):
        best = getattr(best, "text", str(best))

    return warn_output + best
55
+
56
+
57
# Build the web UI: two optional audio inputs (a microphone recording and a
# file upload), both delivered to `transcribe` as file paths, and a single
# text output holding the transcription (or a warning / error message).
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        # Order matters: these map positionally to transcribe(file_mic, file_upload).
        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="NeMo Conformer Transducer Large",
    description="Demo for speech recognition using Conformers",
    enable_queue=True,
    allow_flagging=False,
)
# NOTE(review): `gr.inputs.*`, `layout`, `theme="huggingface"`, `enable_queue`
# and a boolean `allow_flagging` look tied to an older Gradio API -- confirm
# they are still accepted by the Gradio version this app is deployed with.
iface.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ libsndfile1
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ nemo_toolkit[asr]