File size: 3,608 Bytes
9910ecc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fa244e
9910ecc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pandas as pd
import numpy as np

import librosa

import sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras.models import load_model

import pickle 

# Default audio sample rate in Hz. NOTE(review): extract_process() shadows
# this with its own local copy, and librosa.load() below overrides it with
# the file's native rate — confirm which rate is actually intended.
sample_rate = 22050

def noise(data):
    """Augment a waveform with additive white noise.

    The noise amplitude is drawn uniformly at random and scaled by 1.5% of
    the signal's peak value, so louder clips receive proportionally more
    noise. Returns a new array; the input is not modified.
    """
    amplitude = 0.015 * np.random.uniform() * np.amax(data)
    return data + amplitude * np.random.normal(size=data.shape[0])

def stretch(data, rate=0.8):
    """Time-stretch a waveform by *rate* without changing its pitch.

    rate < 1 slows the clip down (longer output); rate > 1 speeds it up.
    """
    stretched = librosa.effects.time_stretch(data, rate=rate)
    return stretched

def shift(data):
    """Circularly shift a waveform by a random offset.

    The offset is drawn uniformly from roughly -5000..+5000 samples;
    np.roll wraps the displaced samples around to the other end.
    """
    offset = int(np.random.uniform(low=-5, high=5) * 1000)
    return np.roll(data, offset)

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift a waveform by *pitch_factor* semitones.

    Duration is unchanged; only the perceived pitch moves.
    """
    shifted = librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)
    return shifted

def extract_process(data):
    """Build a flat 1-D feature vector for one audio clip.

    Concatenates the time-axis means of five librosa features:
    zero-crossing rate, chroma (from the STFT magnitude), MFCCs,
    RMS energy, and the mel spectrogram.

    NOTE(review): the sample rate is hard-coded to 22050 here and shadows
    the module-level constant — confirm callers always resample to 22050.
    """
    sr = 22050
    stft_magnitude = np.abs(librosa.stft(data))

    feature_blocks = [
        np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0),
        np.mean(librosa.feature.chroma_stft(S=stft_magnitude, sr=sr).T, axis=0),
        np.mean(librosa.feature.mfcc(y=data, sr=sr).T, axis=0),
        np.mean(librosa.feature.rms(y=data).T, axis=0),
        np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0),
    ]
    return np.hstack(feature_blocks)

def export_process(path):
    """Load an audio file and return features for three augmented variants.

    Rows of the returned 2-D array (shape (3, n_features)):
      0. the original clip (5 s, starting 1 s in),
      1. the clip with additive white noise,
      2. the clip time-stretched and then pitch-shifted.
    """
    signal, rate = librosa.load(path, duration=5, offset=1)

    base_features = extract_process(signal)

    noisy_features = extract_process(noise(signal))

    stretched = stretch(signal)
    shifted_features = extract_process(pitch(stretched, rate))

    return np.vstack((base_features, noisy_features, shifted_features))

# Load pre-extracted training features from a local pickle file.
# NOTE(review): pickle.load executes arbitrary code if the file is
# untrusted — only unpickle files you produced yourself.
with open('X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)

# Load the matching training labels (same row order as X_train).
with open('Y_train.pkl', 'rb') as f:
    Y_train = pickle.load(f)

# Assemble features and labels into one frame so rows stay aligned.
Features = pd.DataFrame(X_train)
Features['labels'] = Y_train

X = Features.iloc[: ,:-1].values
Y = Features['labels'].values

# One-hot encode labels; encoder_label is reused later by predict_emotion()
# to map model output vectors back to label names.
encoder_label = OneHotEncoder()
Y = encoder_label.fit_transform(np.array(Y).reshape(-1,1)).toarray()

# 90/10 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.9, random_state=42, shuffle=True)

# Fit the scaler on training data only, then apply the same transform to the
# test split; scaler_data is reused at inference time in preprocess_audio().
scaler_data = StandardScaler()
x_train = scaler_data.fit_transform(x_train)
x_test = scaler_data.transform(x_test)

def preprocess_audio(audio):
    """Turn an audio file path into model-ready input.

    Extracts the 3-row augmented feature matrix, applies the scaler fitted
    on the training data, and appends a trailing channel axis so the result
    matches the model's expected (rows, features, 1) input shape.
    """
    scaled = scaler_data.transform(export_process(audio))
    return np.expand_dims(scaled, axis=2)

# Function to predict emotion from preprocessed audio
def predict_emotion(preprocessed_audio):
    """Predict the emotion label for already-preprocessed audio features.

    Args:
        preprocessed_audio: scaled feature array shaped for the model
            (as produced by preprocess_audio).

    Returns:
        The decoded label row for the first prediction — a 1-element array
        of the original label, recovered via encoder_label.inverse_transform.
    """
    # Cache the Keras model on the function object: the original reloaded
    # the .hdf5 file from disk on every call, which is very expensive.
    model = getattr(predict_emotion, "_model", None)
    if model is None:
        model = load_model('speech-emotion-recognition.hdf5')
        predict_emotion._model = model
    prediction = model.predict(preprocessed_audio)
    predicted_emotion = encoder_label.inverse_transform(prediction)
    return predicted_emotion[0]

# Live emotion recognition
def live_emotion_recognition(audio_path):
    """End-to-end inference for one audio file.

    Preprocesses the file at *audio_path*, runs the model, and returns the
    first element of the decoded label row (the label value itself).
    """
    features = preprocess_audio(audio_path)
    label_row = predict_emotion(features)
    return label_row[0]