"""Voice-triggered speech-recognition demo.

Continuously records short clips from the default microphone, uses the
Silero voice-activity detector (VAD) as a cheap gate, and transcribes
speech with the NVIDIA NeMo Canary model once the trigger phrase is heard.
"""

import gradio as gr
import numpy as np
import sounddevice as sd
import torch
from nemo.collections.asr.models import ASRModel

# Load the NeMo ASR model (Canary-1b multilingual recognizer).
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

# Load the Silero VAD model.
# BUG FIX: torch.hub.load for silero-vad returns a (model, utils) tuple;
# the original code bound the whole tuple to `kws_model` and then tried to
# call it, which raises TypeError at runtime.
kws_model, _vad_utils = torch.hub.load("snakers4/silero-vad", "silero_vad")

# Constants
TRIGGER_WORD = "hey alexa"  # phrase that must appear in the transcript to trigger
TRIGGER_DURATION = 2        # seconds of audio recorded per listen cycle
SAMPLE_RATE = 16000         # Hz; both Silero VAD and Canary expect 16 kHz mono
VAD_CHUNK = 512             # Silero VAD processes fixed 512-sample windows at 16 kHz
VAD_THRESHOLD = 0.5         # speech-probability threshold for the VAD gate


def start_recording():
    """Record TRIGGER_DURATION seconds of mono float32 audio.

    Returns:
        np.ndarray: 1-D float32 waveform of length TRIGGER_DURATION * SAMPLE_RATE.
    """
    print("Recording started...")
    audio = sd.rec(
        int(TRIGGER_DURATION * SAMPLE_RATE),
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype="float32",
    )
    sd.wait()  # block until the recording buffer is full
    return audio.flatten()


def detect_trigger(audio):
    """Return True if the Silero VAD finds speech anywhere in `audio`.

    NOTE(review): this is a voice-activity check, not keyword spotting — it
    fires on ANY speech. The actual trigger-phrase match is performed on the
    transcript inside `transcribe_triggered`.
    """
    # BUG FIX: the Silero model expects a torch tensor called as
    # model(chunk, sample_rate), and requires fixed-size 512-sample chunks
    # at 16 kHz, so scan the clip window by window.
    waveform = torch.from_numpy(np.ascontiguousarray(audio, dtype=np.float32))
    for start in range(0, waveform.numel() - VAD_CHUNK + 1, VAD_CHUNK):
        chunk = waveform[start : start + VAD_CHUNK]
        if kws_model(chunk, SAMPLE_RATE).item() >= VAD_THRESHOLD:
            return True
    return False


def transcribe_triggered():
    """Loop until the trigger phrase is heard, then return its transcript.

    Returns:
        str: the transcript of the clip containing the trigger phrase.
    """
    while True:
        print("Listening for trigger word...")
        recorded_audio = start_recording()

        # Cheap gate: skip the expensive ASR pass when there is no speech.
        if not detect_trigger(recorded_audio):
            continue

        print("Speech detected. Transcribing...")
        transcription = model.transcribe([recorded_audio])[0]
        # Newer NeMo releases return Hypothesis objects with a .text field;
        # older ones return plain strings — handle both.
        text = getattr(transcription, "text", transcription)

        # BUG FIX: the original returned on ANY detected speech; the app is
        # meant to trigger only on the phrase itself, so check the transcript.
        if TRIGGER_WORD in text.lower():
            print("Trigger word detected.")
            return text


# BUG FIX: gr.inputs.NoInput() does not exist (the deprecated gr.inputs
# namespace never had a NoInput class). inputs=None yields the intended
# button-only interface.
iface = gr.Interface(
    fn=transcribe_triggered,
    inputs=None,
    outputs="text",
    title="ASR with NeMo Canary Model (Triggered by 'Hey Alexa')",
)

if __name__ == "__main__":
    iface.launch()