from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys
import warnings
import wave

import librosa
import numpy as np
import pyaudio
from flask import Flask
from flask_cors import CORS
from keras.layers import (Activation, BatchNormalization, Conv2D, Dense,
                          Dropout, Flatten, MaxPooling2D)
from keras.models import Sequential
# import scipy.io.wavfile as wav
# from speechpy.feature import mfcc

warnings.filterwarnings("ignore")

app = Flask(__name__)
CORS(app)

# Emotion classes the model predicts.
classLabels = ('Angry', 'Fear', 'Disgust', 'Happy', 'Sad', 'Surprised', 'Neutral')
numLabels = len(classLabels)
in_shape = (39, 216)  # 39 MFCC coefficients x 216 frames

# CNN classifier: four Conv2D blocks with batch normalization and ReLU,
# followed by a small dense head with dropout.
model = Sequential()
model.add(Conv2D(8, (13, 13), input_shape=(in_shape[0], in_shape[1], 1)))
model.add(BatchNormalization(axis=-1))
model.add(Activation('relu'))
model.add(Conv2D(8, (13, 13)))
model.add(BatchNormalization(axis=-1))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 1)))
model.add(Conv2D(8, (3, 3)))
model.add(BatchNormalization(axis=-1))
model.add(Activation('relu'))
model.add(Conv2D(8, (1, 1)))
model.add(BatchNormalization(axis=-1))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 1)))
model.add(Flatten())
model.add(Dense(64))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(numLabels, activation='softmax'))

# categorical_crossentropy matches the 7-way softmax output
# ('binary_crossentropy' was incorrect for a multi-class classifier).
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary(), file=sys.stderr)
model.load_weights('speech_emotion_detection_ravdess_savee.h5')


def detect_emotion(file_name):
    # Load 2.5 s of audio at 44.1 kHz, skipping the first 0.5 s.
    X, sample_rate = librosa.load(file_name, res_type='kaiser_best',
                                  duration=2.5, sr=22050 * 2, offset=0.5)
    mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=39)
    feature = mfccs
    print("Feature shape =>", feature.shape)
    # 2.5 s at 44.1 kHz with librosa's default hop length gives exactly
    # 216 frames; shorter clips will fail this reshape.
    feature = feature.reshape(39, 216, 1)
    result = classLabels[np.argmax(model.predict(np.array([feature])))]
    print("Result ==>", result)
    return result


@app.route("/speech-emotion-recognition/")
def emotion_detection():
    filename = 'audio_files/Happy.wav'
    result = detect_emotion(filename)
    return result


@app.route("/record_audio/")
def record_audio():
    CHUNK = 1024
    FORMAT = pyaudio.paInt16  # 16-bit samples (paInt8 also possible)
    CHANNELS = 2
    RATE = 44100  # sample rate
    RECORD_SECONDS = 4

    # Name the new recording one higher than the largest existing numeric
    # filename; compare as integers so that "10" sorts after "9".
    fileList = os.listdir('recorded_audio')
    print("Audio File List ==>", fileList)
    if fileList:
        new_wav_file = max(int(f.split('.')[0]) for f in fileList) + 1
    else:
        new_wav_file = 1
    new_wav_file = str(new_wav_file) + ".wav"
    WAVE_OUTPUT_FILENAME = os.path.join('recorded_audio', new_wav_file)
    print(WAVE_OUTPUT_FILENAME)

    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)  # buffer
    print("* recording")
    frames = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)  # 2 bytes (16 bits) per channel
    print("* done recording")

    stream.stop_stream()
    stream.close()
    p.terminate()

    # Write the captured frames out as a standard WAV file.
    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
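    # get_sample_size(paInt16) returns 2 bytes per sample, matching the 16-bit capture.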
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    return "Audio Recorded"


if __name__ == "__main__":
    app.run()
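
# Quick check of the two endpoints (a minimal sketch, assuming Flask's
# default development server at http://127.0.0.1:5000 and that
# 'audio_files/Happy.wav' exists next to this script):
#
#   curl http://127.0.0.1:5000/speech-emotion-recognition/
#   curl http://127.0.0.1:5000/record_audio/
#
# The first call returns the predicted emotion label as plain text; the
# second records 4 seconds from the default microphone into recorded_audio/.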