alibabasglab's picture
Upload 73 files
936f6fa verified
import numpy as np
from srmrpy.segmentaxis import segment_axis
def simple_energy_vad(x, fs, framelen=0.02, theta_main=30, theta_min=-55):
'''Simple energy voice activity detection algorithm based on energy
thresholds as described in Tomi Kinnunen and Padmanabhan Rajan, "A
practical, self-adaptive voice activity detector for speaker verification
with noisy telephone and microphone data", ICASSP 2013, Vancouver (NOTE:
this is the benchmark method, not the method proposed by the authors).
'''
# Split signal in frames
framelen = int(framelen * fs)
frames = segment_axis(x, length=framelen, overlap=0, end='pad')
frames_zero_mean = frames - frames.mean(axis=0)
frame_energy = 10*np.log10(1/(framelen-1) * (frames_zero_mean**2).sum(axis=1) + 1e-6)
max_energy = max(frame_energy)
speech_presence = (frame_energy > max_energy - theta_main) & (frame_energy > theta_min)
x_vad = np.zeros_like(x, dtype=bool)
for idx, frame in enumerate(frames):
if speech_presence[idx]:
x_vad[idx*framelen:(idx+1)*framelen] = True
else:
x_vad[idx*framelen:(idx+1)*framelen] = False
return x[x_vad], x_vad
if __name__ == '__main__':
import sys
from scipy.io.wavfile import read as readwav
from matplotlib import pyplot as plt
fs, s = readwav(sys.argv[1])
s = s.astype('float')/np.iinfo(s.dtype).max
s_vad, speech_presence = simple_energy_vad(s, fs)
plt.plot(s)
plt.plot(s_vad - 1, 'g')
plt.show()