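The script below is a real-time interface to the wake-word ("hey ARMAR") model. It records 1-second chunks from the microphone with PyAudio, uses Silero VAD to skip windows that contain no speech, and runs the `nguyenvulebinh/heyarmar` WavLM classifier on a sliding window of the last two chunks, reporting a detection whenever the wake-word probability exceeds 0.6. It needs `torch`, `transformers`, `numpy`, and `pyaudio`, plus a working microphone; the Silero VAD model is fetched via `torch.hub` on first run.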
```python
"""Interface for interacting with the wake-word ("hey ARMAR") model in real time."""
import queue
import threading
import time

import numpy as np
import pyaudio
import torch
from transformers import AutoFeatureExtractor, WavLMForSequenceClassification


def int2float(sound):
    """Convert int16 PCM samples to normalized float32 in [-1, 1]."""
    abs_max = np.abs(sound).max()
    sound = sound.astype('float32')
    if abs_max > 0:
        sound *= 1 / abs_max
    sound = sound.squeeze()  # depends on the use case
    return sound


class RealtimeDecoder:

    def __init__(self, model, feature_extractor) -> None:
        self.model = model
        self.feature_extractor = feature_extractor
        # Silero VAD gates the classifier so it only runs on windows that contain speech.
        self.vad_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                               model='silero_vad',
                                               force_reload=False,
                                               onnx=False)
        (self.get_speech_timestamps, _, _, _, _) = utils
        self.SAMPLE_RATE = 16000
        self.cache_output = {
            "cache": torch.zeros(0, 0, 0, dtype=torch.float),
            "wavchunks": [],
        }
        self.continue_recording = threading.Event()
        self.frame_duration_ms = 1000  # each queued chunk is 1 s of audio
        self.audio_queue = queue.SimpleQueue()
        self.speech_queue = queue.SimpleQueue()

    def start_recording(self, wait_enter_to_stop=True):
        def stop():
            input("Press Enter to stop the recording:\n\n")
            self.continue_recording.set()

        def record():
            audio = pyaudio.PyAudio()
            stream = audio.open(format=pyaudio.paInt16,
                                channels=1,
                                rate=self.SAMPLE_RATE,
                                input=True,
                                frames_per_buffer=int(self.SAMPLE_RATE / 10))
            while not self.continue_recording.is_set():
                audio_chunk = stream.read(int(self.SAMPLE_RATE * self.frame_duration_ms / 1000.0),
                                          exception_on_overflow=False)
                audio_int16 = np.frombuffer(audio_chunk, np.int16)
                audio_float32 = int2float(audio_int16)
                self.audio_queue.put(torch.from_numpy(audio_float32))
            print("Finish record")
            stream.stop_stream()
            stream.close()
            audio.terminate()

        if wait_enter_to_stop:
            stop_listener_thread = threading.Thread(target=stop, daemon=False)
        else:
            stop_listener_thread = None
        recording_thread = threading.Thread(target=record, daemon=False)
        return stop_listener_thread, recording_thread

    def finish_realtime_decode(self):
        """Reset the chunk cache between decoding sessions."""
        self.cache_output = {
            "cache": torch.zeros(0, 0, 0, dtype=torch.float),
            "wavchunks": [],
        }

    def start_decoding(self):
        def decode():
            while not self.continue_recording.is_set():
                if self.audio_queue.qsize() > 0:
                    current_waveform = self.audio_queue.get()
                    if current_waveform is not None:
                        # Sliding cache of the most recent 1 s chunks.
                        self.cache_output['wavchunks'].append(current_waveform)
                        self.cache_output['wavchunks'] = self.cache_output['wavchunks'][-4:]

                        if len(self.cache_output['wavchunks']) > 1:
                            # Classify the last ~2 s of audio (two chunks).
                            waveform = torch.cat(self.cache_output['wavchunks'][-2:], dim=-1)
                            speech_timestamps = self.get_speech_timestamps(
                                waveform, self.vad_model, sampling_rate=self.SAMPLE_RATE)
                            logits = [1, 0]  # default: no wake word detected
                            if len(speech_timestamps) > 0:
                                input_features = self.feature_extractor.pad(
                                    [{"input_values": waveform}], padding=True, return_tensors="pt")
                                with torch.no_grad():
                                    logits = self.model(**input_features).logits.softmax(dim=-1).squeeze()
                            if logits[1] > 0.6:  # index 1 = wake-word class
                                print("hey armar", logits, waveform.size(-1) / self.SAMPLE_RATE)
                                self.cache_output['wavchunks'] = []
                            else:
                                # Backlog indicator: one dot per queued chunk.
                                print('.' + '.' * self.audio_queue.qsize())
                else:
                    time.sleep(0.01)
            print("KWS thread finish")

        kws_decode_thread = threading.Thread(target=decode, daemon=False)
        return kws_decode_thread


if __name__ == "__main__":
    print("Model loading....")
    kws_model = WavLMForSequenceClassification.from_pretrained('nguyenvulebinh/heyarmar')
    feature_extractor = AutoFeatureExtractor.from_pretrained('nguyenvulebinh/heyarmar')
    print("Model loaded....")

    obj_decode = RealtimeDecoder(kws_model, feature_extractor)
    stop_thread, record_thread = obj_decode.start_recording()
    kws_decode_thread = obj_decode.start_decoding()
    for thread in (stop_thread, record_thread):
        if thread is not None:
            thread.start()
    kws_decode_thread.start()
    for thread in (stop_thread, record_thread):
        if thread is not None:
            thread.join()
    kws_decode_thread.join()
```
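For a quick offline sanity check without a microphone, the same classifier can be run on a recorded wav file. This is a minimal sketch based on the commented-out snippet in the script; `./99.wav` is a placeholder path, and the audio is assumed to be a 16 kHz recording of the wake phrase:

```python
import torch
import torchaudio
from transformers import AutoFeatureExtractor, WavLMForSequenceClassification

kws_model = WavLMForSequenceClassification.from_pretrained('nguyenvulebinh/heyarmar')
feature_extractor = AutoFeatureExtractor.from_pretrained('nguyenvulebinh/heyarmar')

# Placeholder path: any 16 kHz recording containing the wake phrase.
wav, rate = torchaudio.load('./99.wav')

# One entry per channel, padded into a single batch.
input_features = feature_extractor.pad([{"input_values": item} for item in wav],
                                       padding=True, return_tensors="pt")
with torch.no_grad():
    output = kws_model(**input_features)

# Index 1 is the wake-word class; the real-time script triggers above 0.6.
print(output.logits.softmax(dim=-1))
```

Each printed row holds the [no-wake-word, wake-word] probabilities for one channel of the input file.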