File size: 1,521 Bytes
936f6fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import numpy as np
from srmrpy.segmentaxis import segment_axis

def simple_energy_vad(x, fs, framelen=0.02, theta_main=30, theta_min=-55):
    '''Simple energy-based voice activity detection (VAD).

    Energy-threshold benchmark as described in Tomi Kinnunen and Padmanabhan
    Rajan, "A practical, self-adaptive voice activity detector for speaker
    verification with noisy telephone and microphone data", ICASSP 2013,
    Vancouver (NOTE: this is the benchmark method, not the method proposed by
    the authors).

    The signal is split into non-overlapping frames. A frame is flagged as
    speech when its energy in dB is both within ``theta_main`` dB of the
    loudest frame and above the absolute floor ``theta_min``.

    Parameters
    ----------
    x : array_like
        Input signal samples (presumably 1-D mono audio; not verified here).
    fs : int or float
        Sampling rate; used only to convert ``framelen`` to samples.
    framelen : float, optional
        Frame length in seconds; each frame holds ``int(framelen * fs)``
        samples.
    theta_main : float, optional
        Largest allowed distance, in dB, below the loudest frame.
    theta_min : float, optional
        Absolute lower bound, in dB, on the energy of a speech frame.

    Returns
    -------
    x_speech : ndarray
        The samples of ``x`` selected by ``x_vad``.
    x_vad : ndarray of bool
        Mask with the shape of ``x``; True where the enclosing frame was
        flagged as speech.

    Raises
    ------
    ValueError
        If a frame would contain fewer than two samples, since the energy
        estimate is normalised by ``framelen - 1``.
    '''
    x = np.asarray(x)

    # Split signal in frames
    framelen = int(framelen * fs)
    if framelen < 2:
        raise ValueError(
            'framelen * fs must give at least 2 samples per frame, got %d'
            % framelen)
    frames = segment_axis(x, length=framelen, overlap=0, end='pad')

    # Remove the mean of each frame (one frame per row) so that the energy
    # below is the per-frame sample variance used by the benchmark method.
    frames_zero_mean = frames - frames.mean(axis=1, keepdims=True)

    # Dividing the float sum (rather than multiplying by 1/(framelen-1))
    # avoids integer division truncating the factor to 0 on Python 2.
    # The 1e-6 offset keeps log10 finite for all-zero frames.
    frame_energy = 10 * np.log10(
        (frames_zero_mean ** 2).sum(axis=1) / (framelen - 1) + 1e-6)
    max_energy = frame_energy.max()
    speech_presence = ((frame_energy > max_energy - theta_main)
                       & (frame_energy > theta_min))

    # Expand the per-frame decision to a per-sample mask. The mask starts
    # all False, so only speech frames need to be written; a slice that
    # runs past the end of x (padded last frame) is clipped by NumPy.
    x_vad = np.zeros_like(x, dtype=bool)
    for idx in np.flatnonzero(speech_presence):
        x_vad[idx * framelen:(idx + 1) * framelen] = True
    return x[x_vad], x_vad

if __name__ == '__main__':
    # Demo: run the VAD on the WAV file given as the first command-line
    # argument and plot the input together with the retained samples.
    import sys
    from scipy.io.wavfile import read as readwav
    from matplotlib import pyplot as plt

    if len(sys.argv) < 2:
        sys.exit('usage: %s <file.wav>' % sys.argv[0])

    fs, s = readwav(sys.argv[1])
    if np.issubdtype(s.dtype, np.integer):
        # Integer PCM: scale by the largest value the sample type can hold.
        s = s.astype('float') / np.iinfo(s.dtype).max
    else:
        # Floating-point WAV data: np.iinfo would raise on a float dtype,
        # so only convert the type and leave the values unscaled.
        s = s.astype('float')
    s_vad, speech_presence = simple_energy_vad(s, fs)

    plt.plot(s)
    # The retained samples are shifted down by 1 so both traces are visible.
    plt.plot(s_vad - 1, 'g')
    plt.show()