jiuuee committed on
Commit 4ac65aa
Parent: e94cc77

Update app.py

Files changed (1)
app.py +30 -23
app.py CHANGED
@@ -1,5 +1,7 @@
 import gradio as gr
 import torch
+import sounddevice as sd
+import numpy as np
 from nemo.collections.asr.models import ASRModel
 
 # Load the NeMo ASR model
@@ -9,30 +11,35 @@ model.eval()
 # Load the keyword spotting model
 kws_model = torch.hub.load('snakers4/silero-vad', 'silero_vad')
 
-def detect_trigger(audio):
-    if audio is None:
-        raise gr.InterfaceError("Please provide some input audio: either upload an audio file or use the microphone")
-
-    # Perform keyword spotting
-    is_triggered = kws_model(audio)  # You need to adapt this line to the actual API of your keyword spotting model
-
+# Constants
+TRIGGER_WORD = "hey alexa"
+TRIGGER_DURATION = 2  # Duration to record after trigger word is detected, in seconds
+SAMPLE_RATE = 16000  # Sample rate for recording
+
+def start_recording():
+    print("Recording started...")
+    audio = sd.rec(int(TRIGGER_DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype='float32')
+    sd.wait()
+    return audio.flatten()
+
+def detect_trigger(audio):
+    # Perform keyword spotting
+    is_triggered = kws_model(audio, sample_rate=SAMPLE_RATE) >= 0.5
     return is_triggered
 
-def transcribe_triggered(audio):
-    if audio is None:
-        raise gr.InterfaceError("Please provide some input audio: either upload an audio file or use the microphone")
-
-    # Check if trigger word is detected
-    is_triggered = detect_trigger(audio)
-    if not is_triggered:
-        return "Trigger word not detected."
-
-    # Perform speech recognition
-    transcription = model.transcribe([audio])
-
-    return transcription[0]
-
-audio_input = gr.components.Audio()
-
-iface = gr.Interface(transcribe_triggered, audio_input, "text", title="ASR with NeMo Canary Model (Triggered by 'Hey Alexa')")
-iface.launch()
+def transcribe_triggered():
+    while True:
+        print("Listening for trigger word...")
+        # Start recording
+        recorded_audio = start_recording()
+
+        # Check if trigger word is detected
+        is_triggered = detect_trigger(recorded_audio)
+        if is_triggered:
+            print("Trigger word detected. Transcribing...")
+            # Perform speech recognition
+            transcription = model.transcribe([recorded_audio])
+            return transcription[0]
+
+iface = gr.Interface(transcribe_triggered, gr.inputs.NoInput(), "text", title="ASR with NeMo Canary Model (Triggered by 'Hey Alexa')")
+iface.launch()
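
A note on this revision: a few of the committed calls don't match the libraries' published APIs. torch.hub.load('snakers4/silero-vad', 'silero_vad') returns a (model, utils) tuple rather than a bare callable, Silero VAD reports a speech probability rather than matching a keyword (so TRIGGER_WORD is defined but never actually used), and gr.inputs.NoInput() does not exist in Gradio. Below is a minimal sketch of how the loop could be wired up instead, assuming the 512-sample chunking shown in the silero-vad README, a Gradio Interface with inputs=None for a no-argument function, and hypothetical helper names (record_clip, contains_speech); 'model' refers to the NeMo ASRModel loaded earlier in app.py, outside the hunks shown above.

import gradio as gr
import torch
import sounddevice as sd

SAMPLE_RATE = 16000
TRIGGER_DURATION = 2   # seconds of audio captured per listening pass
CHUNK = 512            # silero-vad scores 512-sample chunks at 16 kHz

# torch.hub returns (model, utils), not a bare model
vad_model, vad_utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')

def record_clip():
    # Blocking capture of a short mono clip from the default input device
    audio = sd.rec(int(TRIGGER_DURATION * SAMPLE_RATE),
                   samplerate=SAMPLE_RATE, channels=1, dtype='float32')
    sd.wait()
    return audio.flatten()

def contains_speech(audio, threshold=0.5):
    # Score fixed-size chunks; treat any high-probability chunk as the trigger
    for start in range(0, len(audio) - CHUNK + 1, CHUNK):
        chunk = torch.from_numpy(audio[start:start + CHUNK])
        if vad_model(chunk, SAMPLE_RATE).item() >= threshold:
            return True
    return False

def transcribe_triggered():
    # 'model' is the NeMo ASRModel loaded earlier in app.py
    while True:
        clip = record_clip()
        if contains_speech(clip):
            # Recent NeMo releases accept numpy arrays; older ones expect file paths
            return model.transcribe([clip])[0]

iface = gr.Interface(transcribe_triggered, inputs=None, outputs="text",
                     title="ASR with NeMo Canary Model (Triggered by 'Hey Alexa')")
iface.launch()

Even with those fixes, VAD alone fires on any speech; genuine "hey alexa" detection would need a dedicated wake-word model such as openWakeWord or Porcupine. Also note that sounddevice records from the machine running the app, not from the visitor's browser microphone, so this pattern only works for a locally hosted demo.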