import os
from json_tricks import load
import numpy as np
import librosa
from pydub import AudioSegment, effects
import noisereduce as nr
import tensorflow as tf
import keras
from keras.models import model_from_json
from keras.models import load_model
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
saved_model_path = r'./model8723.json'
saved_weights_path = r'./model8723_weights.h5'

# Read the model architecture from the JSON file.
with open(saved_model_path, 'r') as json_file:
    json_savedModel = json_file.read()

# Load the model architecture and weights.
model = tf.keras.models.model_from_json(json_savedModel)
model.load_weights(saved_weights_path)

# Compile the model with the same parameters as the original model.
model.compile(loss='categorical_crossentropy',
              optimizer='RMSProp',
              metrics=['categorical_accuracy'])
print(model.summary())
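
# Optional sanity check: a minimal sketch assuming the model expects the (1, 448, 15)
# input produced by preprocess() below and outputs the 8 emotion classes defined later.
# Uncomment to verify the restored model runs end to end:
# dummy_input = np.zeros((1, 448, 15), dtype=np.float32)
# print(model.predict(dummy_input).shape)  # expected: (1, 8) probability vector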
def convert(y, sr):
    # Convert the float waveform to int16 samples and wrap it in a mono pydub AudioSegment.
    y = np.array(y * (1 << 15), dtype=np.int16)
    audio_segment = AudioSegment(
        y.tobytes(),
        frame_rate=sr,
        sample_width=y.dtype.itemsize,
        channels=1
    )
    return audio_segment
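
# Example sketch for convert(): a one-second 440 Hz tone (purely illustrative) wrapped
# as a mono pydub segment. Uncomment to try it:
# tone = np.sin(2 * np.pi * 440 * np.arange(22050) / 22050)
# print(convert(tone, 22050).duration_seconds)  # ~1.0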
def preprocess(y, sr):
    '''
    Preprocess an audio signal before executing a prediction.
    Arguments:
    - y - The raw audio samples.
    - sr - The sampling rate of the audio.
    Return:
    'X_3D' variable with shape (batch, timesteps, features) for a single file (batch = 1).
    '''
    total_length = 204288
    frame_length = 2048
    hop_length = 512

    # Fetch sample rate.
    # _, sr = librosa.load(path=file_path, sr=None)
    # Load audio file
    rawsound = convert(y, sr)
    # y = y.astype(np.float32)
    # y /= np.max(np.abs(y))
    # rawsound = AudioSegment.from_mono_audiosegments(y)

    # Normalize to 5 dBFS of headroom.
    normalizedsound = effects.normalize(rawsound, headroom=5.0)
    # Transform the audio back into an np.array of samples.
    normal_x = np.array(normalizedsound.get_array_of_samples(), dtype='float32')
    # Noise reduction (updated 03/03/22).
    final_x = nr.reduce_noise(normal_x, sr=sr)

    # Feature extraction.
    f1 = librosa.feature.rms(y=final_x, frame_length=frame_length, hop_length=hop_length,
                             center=True, pad_mode='reflect').T   # Energy - Root Mean Square
    f2 = librosa.feature.zero_crossing_rate(final_x, frame_length=frame_length,
                                            hop_length=hop_length, center=True).T  # ZCR
    f3 = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13, hop_length=hop_length).T  # MFCC
    X = np.concatenate((f1, f2, f3), axis=1)

    # Pad or truncate to a fixed (448, 15) feature matrix.
    padding_rows = 448 - len(X)
    if padding_rows < 0:
        X = X[:448, :15]
    if padding_rows > 0:
        X = np.vstack((X, np.zeros((padding_rows, 15))))
    X_3D = np.expand_dims(X, axis=0)
    return X_3D
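
# Example usage sketch for preprocess(); 'example.wav' is a hypothetical local file.
# y_ex, sr_ex = librosa.load('example.wav', sr=None)
# print(preprocess(y_ex, sr_ex).shape)  # (1, 448, 15): (batch, timesteps, features)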
emotions = {
    0: 'neutral',
    1: 'calm',
    2: 'happy',
    3: 'sad',
    4: 'angry',
    5: 'fearful',
    6: 'disgust',
    7: 'surprised'
}
emo_list = list(emotions.values())
def is_silent(data):
    # Returns 'True' if the signal is below the 'silent' threshold.
    return max(data) < 100
import pyaudio
import wave
from array import array
import struct
import time

# Initialize recording parameters.
RATE = 24414
CHUNK = 512
RECORD_SECONDS = 7.1
CHANNELS = 1
WAVE_OUTPUT_FILE = "./output.wav"
def EmotionRecogniser(stream, new_chunk):
    # Accumulate audio until the stream reaches about 7.1 seconds; until then the prediction is not updated.
    sr, y = new_chunk
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    # SESSION START
    print("** session started")
    total_predictions = []  # A list for all predictions in the session.

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    # if len(stream) < int(RATE * RECORD_SECONDS):
    #     return stream, 'neutral'

    x = preprocess(y=stream, sr=sr)  # Preprocess the accumulated stream.
    print('x shape:', x.shape)

    # Model's prediction => an 8-emotion probability array.
    predictions = model.predict(x, use_multiprocessing=True)
    pred_list = list(predictions)
    pred_np = np.squeeze(np.array(pred_list).tolist(), axis=0)  # Get rid of 'array' & 'dtype' statements.
    total_predictions.append(pred_np)

    # Dict of emotions with their respective probabilities.
    emotions_prob = dict(zip(emo_list, pred_np))
    max_emo = np.argmax(predictions)
    print('max emotion:', emotions.get(max_emo, -1))

    stream = stream[len(y):]  # Reset the stream for the next session.
    return stream, emotions_prob
# Present the emotion distribution for the whole session.
# total_predictions_np = np.mean(np.array(total_predictions).tolist(), axis=0)
# fig = plt.figure(figsize=(10, 5))
# plt.bar(emo_list, total_predictions_np, color='indigo')
# plt.ylabel("Mean probability (%)")
# plt.title("Session Summary")
# plt.show()
# print(f"Emotions analyzed for: {(toc - tic):0.4f} seconds")
# return str(emotions.get(np.argmax(total_predictions_np), -1))
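
# Standalone usage sketch for EmotionRecogniser() outside Gradio, fed with random
# samples in place of microphone audio (purely illustrative). Uncomment to try it:
# fake_chunk = (RATE, np.random.randint(-32768, 32767, size=int(RATE * RECORD_SECONDS), dtype=np.int16))
# _, probs = EmotionRecogniser(None, fake_chunk)
# print(max(probs, key=probs.get))  # most likely emotion label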
##################################################
import gradio as gr
from transformers import pipeline
import numpy as np

# transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
# def transcribe(stream, new_chunk):
#     sr, y = new_chunk
#     y = y.astype(np.float32)
#     y /= np.max(np.abs(y))
#     if stream is not None:
#         stream = np.concatenate([stream, y])
#     else:
#         stream = y
#     return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]
demo = gr.Interface(
    EmotionRecogniser,
    ["state", gr.Audio(sources=["microphone"], streaming=True, every=1.0)],
    ["state", "label"],
    live=True,
)

demo.launch()