|
|
| from .base import MetaItem, BasePipe |
| from ..helpers.vadprocessor import FixedVADIterator |
|
|
| import numpy as np |
| import logging |
|
|
| |
|
|
|
|
class VadPipe(BasePipe):
    """Pipeline stage that runs voice-activity detection (VAD) over a stream
    of float32 PCM chunks and forwards only the speech portions.

    Incoming ``MetaItem.source_audio`` bytes are decoded, passed through a
    shared :class:`FixedVADIterator`, and the detected speech span is written
    back to ``MetaItem.audio`` together with a ``speech_status`` of
    ``'START'`` (inside a speech segment) or ``'END'`` (outside one).
    """

    # Shared, lazily-created VAD iterator (one per process); see init().
    vac = None
    # Expected input sample rate in Hz for the VAD model.
    sample_rate = 16000

    def __init__(self, in_queue=None, out_queue=None) -> None:
        super().__init__(in_queue, out_queue)
        self._offset = 0       # absolute frame index of the start of the next chunk
        self._status = 'END'   # 'START' while inside speech, 'END' otherwise

    def reset(self):
        """Reset stream position, speech state, and the shared VAD state.

        Assumes ``init()`` has already been called (``vac`` is not None) —
        TODO(review): confirm callers guarantee this ordering.
        """
        self._offset = 0
        self._status = 'END'
        self.vac.reset_states()

    @classmethod
    def init(cls):
        """Lazily construct the class-wide VAD iterator (idempotent) and
        reset its internal state."""
        if cls.vac is None:
            cls.vac = FixedVADIterator(
                threshold=0.6,
                sampling_rate=cls.sample_rate,
                min_silence_duration_ms=100,
            )
        cls.vac.reset_states()

    def _process_speech_chunk(self, source_audio: np.ndarray):
        """Run VAD on one chunk of samples.

        Returns a ``(relative_start_frame, relative_end_frame)`` tuple —
        either element may be ``None`` — when the VAD reports a boundary
        inside this chunk, or ``None`` when no boundary was detected.
        Frame indices are made relative to this chunk by subtracting the
        running ``_offset``.
        """
        speech_dict = self.vac(source_audio, return_seconds=False)
        if not speech_dict:
            return None

        relative_start_frame = None
        relative_end_frame = None
        start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
        # Compare against None, not truthiness: frame 0 is a valid boundary
        # and the original `if start_frame:` silently dropped it.
        if start_frame is not None:
            relative_start_frame = start_frame - self._offset
        if end_frame is not None:
            relative_end_frame = end_frame - self._offset
        return relative_start_frame, relative_end_frame

    def process(self, in_data: MetaItem) -> MetaItem:
        """Extract the speech portion of ``in_data.source_audio``.

        Decodes the chunk as float32 PCM, slices out the detected speech
        span (with a 100-frame lead-in before a detected start), and writes
        it back to ``in_data.audio``; ``in_data.source_audio`` is cleared
        and ``in_data.speech_status`` reflects the current state.
        """
        if self._offset == 0:
            # First chunk of a fresh stream: make sure the shared VAD
            # carries no state from a previous stream.
            self.vac.reset_states()

        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
        logging.debug("source_audio.shape = %s", source_audio.shape)
        speech_data = self._process_speech_chunk(source_audio)

        if speech_data:
            rel_start_frame, rel_end_frame = speech_data
            if rel_start_frame is not None and rel_end_frame is None:
                # Speech starts in this chunk and continues past it.
                self._status = "START"
                target_audio = source_audio[max(rel_start_frame - 100, 0):]
                logging.debug("🫸 Speech start frame: %s", rel_start_frame)
            elif rel_start_frame is None and rel_end_frame is not None:
                # Speech that began earlier ends inside this chunk.
                self._status = "END"
                target_audio = source_audio[:rel_end_frame]
                logging.debug(" 🫷Speech ended, capturing audio up to frame: %s", rel_end_frame)
            else:
                # A complete speech segment lies within this chunk.
                self._status = 'END'
                target_audio = source_audio[max(rel_start_frame - 100, 0):rel_end_frame]
                logging.debug(" 🔄 Speech segment captured from frame %s to frame %s",
                              rel_start_frame, rel_end_frame)
            # BUG FIX: this diagnostic was previously an unconditional
            # print() placed after the whole if/else, which raised NameError
            # whenever no boundary was detected (rel_* unbound). It is now
            # emitted only where the names exist, via logging.
            logging.debug("start: %s end: %s", rel_start_frame, rel_end_frame)
        else:
            if self._status == 'START':
                # Mid-segment chunk: still inside speech, pass it all through.
                target_audio = source_audio
            else:
                # Silence outside any speech segment: emit nothing.
                target_audio = np.array([], dtype=np.float32)

        self._offset += len(source_audio)

        in_data.audio = target_audio.tobytes()
        in_data.source_audio = b''
        in_data.speech_status = self._status
        return in_data
|
|