import pandas as pd
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
import pickle
# Default sampling rate used by librosa.load and for feature extraction
sample_rate = 22050

# --- Data augmentation helpers ---
def noise(data):
    # Add white noise scaled by the signal's peak amplitude
    noise_value = 0.015 * np.random.uniform() * np.amax(data)
    return data + noise_value * np.random.normal(size=data.shape[0])

def stretch(data, rate=0.8):
    # Time-stretch the signal without changing its pitch
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    # Circularly shift the signal by a random offset of up to +/- 5000 samples
    shift_range = int(np.random.uniform(low=-5, high=5) * 1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    # Shift the pitch by `pitch_factor` semitones
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)
# --- Feature extraction ---
def extract_process(data):
    # Concatenate the means of ZCR, chroma, MFCC, RMS and mel-spectrogram frames
    # into a single 1-D feature vector (162 values with librosa's default settings)
    sample_rate = 22050
    output_result = np.array([])
    mean_zero = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    output_result = np.hstack((output_result, mean_zero))
    stft_out = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft_out, sr=sample_rate).T, axis=0)
    output_result = np.hstack((output_result, chroma_stft))
    mfcc_out = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    output_result = np.hstack((output_result, mfcc_out))
    root_mean_out = np.mean(librosa.feature.rms(y=data).T, axis=0)
    output_result = np.hstack((output_result, root_mean_out))
    mel_spectrogram = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    output_result = np.hstack((output_result, mel_spectrogram))
    return output_result
def export_process(path):
    # Build three feature rows per file: original, noise-augmented, and stretch+pitch-augmented
    data, sample_rate = librosa.load(path, duration=5, offset=1)
    output_1 = extract_process(data)
    result = np.array(output_1)
    noise_out = noise(data)
    output_2 = extract_process(noise_out)
    result = np.vstack((result, output_2))
    new_out = stretch(data)
    stretch_pitch = pitch(new_out, sample_rate)
    output_3 = extract_process(stretch_pitch)
    result = np.vstack((result, output_3))
    return result
# Load the pickled training features (X_train) from Google Drive
with open('X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
# Load the pickled training labels (Y_train) from Google Drive
with open('Y_train.pkl', 'rb') as f:
    Y_train = pickle.load(f)
# Assemble features and labels into one DataFrame
Features = pd.DataFrame(X_train)
Features['labels'] = Y_train
X = Features.iloc[:, :-1].values
Y = Features['labels'].values

# One-hot encode the emotion labels
encoder_label = OneHotEncoder()
Y = encoder_label.fit_transform(np.array(Y).reshape(-1, 1)).toarray()

# Train/test split and feature scaling (the fitted scaler is reused at inference time)
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.9, random_state=42, shuffle=True)
scaler_data = StandardScaler()
x_train = scaler_data.fit_transform(x_train)
x_test = scaler_data.transform(x_test)
def preprocess_audio(audio):
    # Extract features for the file at `audio`, scale them with the fitted scaler,
    # and add a channel axis so the shape matches the Conv1D model input
    features = export_process(audio)
    features = scaler_data.transform(features)
    return np.expand_dims(features, axis=2)
# Function to predict emotion from preprocessed audio
def predict_emotion(preprocessed_audio):
    # Note: loading the model on every call is simple but slow; cache it if latency matters
    model = load_model('speech-emotion-recognition.hdf5')
    prediction = model.predict(preprocessed_audio)
    # Map one-hot predictions back to emotion labels; row 0 is the unaugmented clip
    predicted_emotion = encoder_label.inverse_transform(prediction)
    return predicted_emotion[0][0]
# Live emotion recognition: from an audio file path to a predicted emotion label
def live_emotion_recognition(audio_path):
    # Preprocess live audio
    preprocessed_audio = preprocess_audio(audio_path)
    # Predict emotion
    predicted_emotion = predict_emotion(preprocessed_audio)
    return predicted_emotion
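# Example usage: a minimal sketch, assuming a local recording named 'sample.wav'
# (the path is illustrative; any audio file librosa can read will work) and that
# 'speech-emotion-recognition.hdf5' sits alongside this script.
if __name__ == '__main__':
    emotion = live_emotion_recognition('sample.wav')
    print("Predicted Emotion:", emotion)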