Speech-Emotion-Detector / extract_features.py
import numpy as np
import soundfile
import librosa
def extract_feature(file_name, **kwargs):
    """Extract audio features from a sound file.

    Supported boolean kwargs: mfcc, chroma, mel, contrast, tonnetz.
    Returns a 1-D numpy array with the requested features concatenated.
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as audio_clip:
        X = audio_clip.read(dtype="float32")
        if X.ndim > 1:
            # Mix multi-channel audio down to mono; librosa features expect 1-D input.
            X = np.mean(X, axis=1)
        # Short-time Fourier transform magnitude, shared by the chroma and contrast features.
        sound_fourier = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            # Mel-frequency cepstral coefficients, averaged over time (40 values).
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=audio_clip.samplerate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            # Chromagram from the STFT magnitude, averaged over time (12 values).
            chroma = np.mean(librosa.feature.chroma_stft(S=sound_fourier, sr=audio_clip.samplerate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            # Mel-scaled spectrogram, averaged over time (128 values).
            # Newer librosa versions require keyword arguments here, so pass y= explicitly.
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=audio_clip.samplerate).T, axis=0)
            result = np.hstack((result, mel))
        if contrast:
            # Spectral contrast from the STFT magnitude, averaged over time (7 values).
            contrast = np.mean(librosa.feature.spectral_contrast(S=sound_fourier, sr=audio_clip.samplerate).T, axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            # Tonal centroid features computed on the harmonic component (6 values).
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=audio_clip.samplerate).T, axis=0)
            result = np.hstack((result, tonnetz))
    return result
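

# A minimal usage sketch. "speech_sample.wav" is a hypothetical placeholder
# path, not a file shipped with this repo; substitute any WAV clip.
if __name__ == "__main__":
    features = extract_feature("speech_sample.wav", mfcc=True, chroma=True, mel=True)
    # With these three flags the vector holds 40 (MFCC) + 12 (chroma)
    # + 128 (mel) = 180 values, per librosa's default feature sizes.
    print(features.shape)  # expected: (180,)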