import pandas as pd
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
import pickle
# Default sampling rate used by librosa.load and for feature extraction
sample_rate = 22050

# --- Data augmentation helpers ---
def noise(data):
    # Add white noise scaled by the signal's peak amplitude
    noise_value = 0.015 * np.random.uniform() * np.amax(data)
    return data + noise_value * np.random.normal(size=data.shape[0])

def stretch(data, rate=0.8):
    # Time-stretch the signal without changing its pitch
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    # Circularly shift the signal by a random offset of up to +/- 5000 samples
    shift_range = int(np.random.uniform(low=-5, high=5) * 1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    # Shift the pitch by `pitch_factor` semitones
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)
# --- Feature extraction ---
def extract_process(data):
    # Concatenate the means of ZCR, chroma, MFCC, RMS and mel-spectrogram frames
    # into a single 1-D feature vector (162 values with librosa's default settings)
    sample_rate = 22050
    output_result = np.array([])
    mean_zero = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    output_result = np.hstack((output_result, mean_zero))
    stft_out = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft_out, sr=sample_rate).T, axis=0)
    output_result = np.hstack((output_result, chroma_stft))
    mfcc_out = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    output_result = np.hstack((output_result, mfcc_out))
    root_mean_out = np.mean(librosa.feature.rms(y=data).T, axis=0)
    output_result = np.hstack((output_result, root_mean_out))
    mel_spectrogram = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    output_result = np.hstack((output_result, mel_spectrogram))
    return output_result
def export_process(path):
    # Build three feature rows per file: original, noise-augmented, and stretch+pitch-augmented
    data, sample_rate = librosa.load(path, duration=5, offset=1)
    output_1 = extract_process(data)
    result = np.array(output_1)
    noise_out = noise(data)
    output_2 = extract_process(noise_out)
    result = np.vstack((result, output_2))
    new_out = stretch(data)
    stretch_pitch = pitch(new_out, sample_rate)
    output_3 = extract_process(stretch_pitch)
    result = np.vstack((result, output_3))
    return result
# Load the pickled training features (X_train) from Google Drive
with open('X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
# Load the pickled training labels (Y_train) from Google Drive
with open('Y_train.pkl', 'rb') as f:
    Y_train = pickle.load(f)
# Assemble features and labels into one DataFrame
Features = pd.DataFrame(X_train)
Features['labels'] = Y_train
X = Features.iloc[:, :-1].values
Y = Features['labels'].values

# One-hot encode the emotion labels
encoder_label = OneHotEncoder()
Y = encoder_label.fit_transform(np.array(Y).reshape(-1, 1)).toarray()

# Train/test split and feature scaling (the fitted scaler is reused at inference time)
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.9, random_state=42, shuffle=True)
scaler_data = StandardScaler()
x_train = scaler_data.fit_transform(x_train)
x_test = scaler_data.transform(x_test)
def preprocess_audio(audio):
    # Extract features for the file at `audio`, scale them with the fitted scaler,
    # and add a channel axis so the shape matches the Conv1D model input
    features = export_process(audio)
    features = scaler_data.transform(features)
    return np.expand_dims(features, axis=2)
# Function to predict emotion from preprocessed audio
def predict_emotion(preprocessed_audio):
    # Note: loading the model on every call is simple but slow; cache it if latency matters
    model = load_model('speech-emotion-recognition.hdf5')
    prediction = model.predict(preprocessed_audio)
    # Map one-hot predictions back to emotion labels; row 0 is the unaugmented clip
    predicted_emotion = encoder_label.inverse_transform(prediction)
    return predicted_emotion[0][0]
# Live emotion recognition: from an audio file path to a predicted emotion label
def live_emotion_recognition(audio_path):
    # Preprocess live audio
    preprocessed_audio = preprocess_audio(audio_path)
    # Predict emotion
    predicted_emotion = predict_emotion(preprocessed_audio)
    return predicted_emotion
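# Example usage: a minimal sketch, assuming a local recording named 'sample.wav'
# (the path is illustrative; any audio file librosa can read will work) and that
# 'speech-emotion-recognition.hdf5' sits alongside this script.
if __name__ == '__main__':
    emotion = live_emotion_recognition('sample.wav')
    print("Predicted Emotion:", emotion)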