import wave

import numpy as np
import webrtcvad

# Audio format: webrtcvad requires 16-bit mono PCM at 8000, 16000,
# 32000, or 48000 Hz, fed in frames of 10, 20, or 30 ms.
RATE = 48000
CHUNK_SIZE = 960        # 20 ms of audio at 48 kHz, in samples
CHUNK_DURATION_MS = 20  # duration of one chunk in milliseconds

# Alternative test clip:
# wav = "/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/20221228_video_good_normed_5/take1_001_norm.wav"
wav = "/home/kevingeng/Disk2/laronix/Laronix_ASR_TTS_VC/wav/VAD_test.wav"
wf = wave.open(wav, "rb")

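# Quick sanity check on the frame arithmetic (illustrative addition, not
# part of the original script): 20 ms at 48 kHz is 48000 * 20 / 1000 = 960
# samples, i.e. 1920 bytes of 16-bit mono PCM, exactly one full frame for
# webrtcvad.
assert CHUNK_SIZE == RATE * CHUNK_DURATION_MS // 1000
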
def streaming_VAD(wf):
    """Run WebRTC VAD over `wf` and return True if the first analysis
    window is mostly voiced, False if it is mostly silent, or None if
    the file is shorter than the window."""
    vad = webrtcvad.Vad()
    vad.set_mode(2)  # aggressiveness 0-3; 2 is fairly aggressive

    # VAD constants: the buffer spans MAX_SILENCE_DURATION ms of audio,
    # i.e. MAX_SILENCE_DURATION / CHUNK_DURATION_MS chunks of 20 ms each.
    MIN_SILENCE_DURATION = 2000  # in ms (currently unused)
    MAX_SILENCE_DURATION = 4000  # in ms
    BUFFER_SIZE = MAX_SILENCE_DURATION // CHUNK_DURATION_MS  # in chunks, not samples
    BUFFER_THRESHOLD = int(BUFFER_SIZE * 0.5)

    # Per-chunk speech/non-speech flags
    vad_buffer = []
    data = wf.readframes(CHUNK_SIZE)
    # Loop through the audio file chunk by chunk
    while data:
        audio_chunk = np.frombuffer(data, dtype=np.int16)
        # is_speech() needs the raw bytes of one complete 10/20/30 ms
        # frame; a short trailing chunk would raise, so count it as silence.
        if len(audio_chunk) == CHUNK_SIZE:
            is_speech = vad.is_speech(audio_chunk.tobytes(), RATE)
        else:
            is_speech = False
        vad_buffer.append(is_speech)

        # Once the buffer is full, make the decision and return.
        if len(vad_buffer) == BUFFER_SIZE:
            if vad_buffer.count(False) >= BUFFER_THRESHOLD:
                # Mostly silence
                return False
            else:
                # Voice detected
                return True
        data = wf.readframes(CHUNK_SIZE)
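

# Usage sketch (illustrative addition, not part of the original script):
# a minimal driver for the wave file opened above. For live capture, the
# PyAudio input stream that the original script's comments hinted at could
# be used instead; that variant is sketched (commented out) below and
# assumes a default input device.
if __name__ == "__main__":
    result = streaming_VAD(wf)
    if result is None:
        print("File shorter than the analysis window")
    elif result:
        print("Voice detected!")
    else:
        print("Silence")
    wf.close()

    # Live-microphone variant (requires `pip install pyaudio`):
    # import pyaudio
    # p = pyaudio.PyAudio()
    # stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE,
    #                 input=True, frames_per_buffer=CHUNK_SIZE)
    # frame = stream.read(CHUNK_SIZE)  # raw bytes for vad.is_speech(frame, RATE)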