|
import pandas as pd |
|
import numpy as np |
|
|
|
import librosa |
|
|
|
import sklearn |
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder |
|
from sklearn.model_selection import train_test_split |
|
|
|
import tensorflow as tf |
|
from keras.models import load_model |
|
|
|
import pickle |
|
|
|
# Module-level sampling rate in Hz. The functions in this file bind their own
# local ``sample_rate`` name, so this value only serves as the shared default.
sample_rate = 22050
|
|
|
def noise(data):
    """Return a copy of ``data`` with random Gaussian noise added.

    The noise amplitude is a random fraction (at most 1.5 %) of the largest
    sample value in ``data``. The input array itself is not modified.

    Args:
        data: Array of audio samples; one noise value is drawn per entry
            along its first axis.

    Returns:
        A new array equal to ``data`` plus the scaled noise.
    """
    # Draw order matters for reproducibility: the uniform scale first,
    # then the per-sample normal values.
    amplitude = 0.015 * np.random.uniform() * np.amax(data)
    perturbation = np.random.normal(size=data.shape[0])
    return data + amplitude * perturbation
|
|
|
def stretch(data, rate=0.8):
    """Time-stretch an audio signal with librosa.

    Args:
        data: Audio time series passed to ``librosa.effects.time_stretch``.
        rate: Stretch factor forwarded to librosa (default 0.8).

    Returns:
        The time-stretched audio returned by librosa.
    """
    stretched = librosa.effects.time_stretch(data, rate=rate)
    return stretched
|
|
|
def shift(data):
    """Circularly shift ``data`` by a random number of samples.

    The offset is drawn uniformly from [-5, 5), multiplied by 1000 and
    truncated to an integer; samples rolled past one end re-enter at the other.

    Args:
        data: Array of audio samples.

    Returns:
        A rolled copy of ``data`` with the same shape.
    """
    offset = int(np.random.uniform(low=-5, high=5) * 1000)
    rolled = np.roll(data, offset)
    return rolled
|
|
|
def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift an audio signal with librosa.

    Args:
        data: Audio time series passed to ``librosa.effects.pitch_shift``.
        sampling_rate: Sampling rate of ``data``, forwarded as ``sr``.
        pitch_factor: Number of steps to shift by, forwarded as ``n_steps``
            (default 0.7).

    Returns:
        The pitch-shifted audio returned by librosa.
    """
    shifted = librosa.effects.pitch_shift(
        data,
        sr=sampling_rate,
        n_steps=pitch_factor,
    )
    return shifted
|
|
|
def extract_process(data, sample_rate=22050):
    """Build one flat feature vector describing an audio signal.

    The vector is the concatenation, in this order, of the per-frame mean of:
    zero-crossing rate, chroma (computed from the STFT magnitude), MFCCs,
    RMS energy and the mel spectrogram.

    Args:
        data: Audio time series as accepted by librosa's feature functions
            (presumably the 1-D array returned by ``librosa.load``).
        sample_rate: Sampling rate of ``data`` in Hz, forwarded to the
            rate-dependent features (chroma, MFCC, mel spectrogram).
            Defaults to 22050, the value that used to be hard-coded here, so
            existing single-argument calls behave exactly as before.

    Returns:
        A ``numpy.ndarray`` with all feature means laid out back to back.
    """
    zero_crossing = np.mean(
        librosa.feature.zero_crossing_rate(y=data).T, axis=0
    )

    # Chroma is derived from the magnitude spectrogram rather than raw audio.
    stft_magnitude = np.abs(librosa.stft(data))
    chroma = np.mean(
        librosa.feature.chroma_stft(S=stft_magnitude, sr=sample_rate).T, axis=0
    )

    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)

    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)

    mel = np.mean(
        librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0
    )

    # The leading empty (float64) array reproduces the dtype promotion of the
    # original incremental hstack, which started from ``np.array([])``.
    return np.hstack((np.array([]), zero_crossing, chroma, mfcc, rms, mel))
|
|
|
def export_process(path):
    """Load an audio clip and extract features for it and two augmentations.

    Reads 2.5 seconds of audio starting 1 second into the file at ``path``,
    then runs ``extract_process`` on three versions of the signal.

    Args:
        path: Location of the audio file, as accepted by ``librosa.load``.

    Returns:
        A 2-D array with one feature row per variant, in this order:
        the unmodified clip, the clip with noise added, and the clip after
        time-stretching followed by pitch-shifting.
    """
    signal, clip_rate = librosa.load(path, duration=2.5, offset=1)

    # Row 0: features of the clip exactly as loaded.
    plain_features = extract_process(signal)

    # Row 1: features after additive random noise.
    noisy_features = extract_process(noise(signal))

    # Row 2: features after stretching in time, then shifting the pitch.
    stretched_and_pitched = pitch(stretch(signal), clip_rate)
    augmented_features = extract_process(stretched_and_pitched)

    return np.vstack((plain_features, noisy_features, augmented_features))
|
|
|
|
|
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Previously saved training features and their labels.
X_train = _load_pickle('X_train.pkl')
Y_train = _load_pickle('Y_train.pkl')

# One table: the feature columns followed by a trailing 'labels' column.
Features = pd.DataFrame(X_train).assign(labels=Y_train)

X = Features.iloc[:, :-1].values
Y = Features['labels'].values

# Fit the label encoder here so predictions can be decoded back to labels.
encoder_label = OneHotEncoder()
Y = encoder_label.fit_transform(np.array(Y).reshape(-1, 1)).toarray()

# Fixed random_state keeps the split (and so the fitted scaler) repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.9,
    random_state=42,
    shuffle=True,
)

# The scaler is fitted on the training portion only and reused on new audio.
scaler_data = StandardScaler()
x_train = scaler_data.fit_transform(x_train)
x_test = scaler_data.transform(x_test)
|
|
|
def preprocess_audio(audio):
    """Turn an audio file into scaled model input.

    Args:
        audio: Path of the audio file, handed to ``export_process``.

    Returns:
        The feature rows from ``export_process``, standardised with the
        module-level ``scaler_data`` and given an extra trailing axis
        (inserted at position 2).
    """
    scaled_rows = scaler_data.transform(export_process(audio))
    return np.expand_dims(scaled_rows, 2)
|
|
|
|
|
# Cache for the trained network so it is read from disk only once per process.
_emotion_model = None


def _get_emotion_model():
    """Return the trained Keras model, loading it from disk on first use."""
    global _emotion_model
    if _emotion_model is None:
        _emotion_model = load_model('speech-emotion-recognition.hdf5')
    return _emotion_model


def predict_emotion(preprocessed_audio):
    """Run the emotion model on prepared features and decode the result.

    The model file is loaded on the first call and reused afterwards;
    previously it was reloaded from disk on every prediction.

    Args:
        preprocessed_audio: Model input as produced by ``preprocess_audio``.

    Returns:
        The first row of the labels obtained by passing the model output
        through ``encoder_label.inverse_transform``.
    """
    model = _get_emotion_model()
    prediction = model.predict(preprocessed_audio)
    predicted_emotion = encoder_label.inverse_transform(prediction)
    # Only the first input row's decoded label is returned.
    return predicted_emotion[0]
|
|
|
|
|
def live_emotion_recognition(audio_path):
    """Predict the emotion for the audio file at ``audio_path``.

    Chains ``preprocess_audio`` and ``predict_emotion``.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        The first element of the value returned by ``predict_emotion``.
    """
    return predict_emotion(preprocess_audio(audio_path))[0]