# hf-speech-eval / face_emotion_detection.py
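"""Detect faces in each frame of a video and classify their emotions with an
AffectNet-trained MobileNet, writing an annotated video and per-frame JSON
detections to ./output."""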
import argparse
import datetime
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential, load_model, model_from_json
from tensorflow.compat.v1.keras.backend import set_session
from facial_analysis import FacialImageProcessing


class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)


def initialize():
    """Configure TensorFlow so GPU memory is allocated on demand."""
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    set_session(sess)


def mobilenet_preprocess_input(x, **kwargs):
    # Subtract the per-channel ImageNet means (Caffe-style preprocessing).
    x[..., 0] -= 103.939
    x[..., 1] -= 116.779
    x[..., 2] -= 123.68
    return x


def detect_emotion(frame_bgr):
    """Detect faces in a BGR frame, classify the emotion of each face, and
    return the annotated frame together with a dict of predictions."""
    imgProcessing = FacialImageProcessing(False)
    # Note: the model is reloaded on every call; cache it if throughput matters.
    model = load_model('./models/affectnet_emotions/mobilenet_7.h5')
    # print(model.summary())
    preprocessing_function = mobilenet_preprocess_input
    INPUT_SIZE = (224, 224)
    idx_to_class = {0: 'Anger', 1: 'Disgust', 2: 'Fear',
                    3: 'Happiness', 4: 'Neutral', 5: 'Sadness', 6: 'Surprise'}

    # The face detector and the emotion model both expect RGB input.
    frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    bounding_boxes, points = imgProcessing.detect_faces(frame)
    points = points.T

    detections = {"id": str(datetime.datetime.now()), "faces": []}
    for bbox, p in zip(bounding_boxes, points):
        face_pred = {}
        box = bbox.astype(int)  # np.int is removed in recent NumPy releases
        x1, y1, x2, y2 = box[0:4]
        face_img = frame[y1:y2, x1:x2, :]
        try:
            face_img = cv2.resize(face_img, INPUT_SIZE)
        except cv2.error:
            # Skip degenerate boxes (e.g. empty crops at the frame border).
            continue
        inp = preprocessing_function(face_img.astype(np.float32))
        inp = np.expand_dims(inp, axis=0)
        scores = model.predict(inp)[0]
        top_idx = np.argmax(scores)

        # Draw the bounding box and the top-scoring emotion on the frame.
        frame = cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 9, 12), 4)
        cv2.putText(frame, '{} {:.3f}'.format(idx_to_class[top_idx], scores[top_idx]),
                    (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

        face_pred["face_bbox"] = [x1, y1, x2, y2]
        face_pred["emotion_predicted"] = idx_to_class[top_idx]
        face_pred["scores"] = {idx_to_class[i]: scores[i] for i in range(len(scores))}
        detections["faces"].append(face_pred)

    # Convert the annotated frame back to BGR for cv2.imshow / VideoWriter.
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    print(detections)
    return frame, detections
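
# A minimal single-image usage sketch (hypothetical 'face.jpg' path; assumes the
# mobilenet_7.h5 weights and the facial_analysis module are available locally):
#
#   img_bgr = cv2.imread('face.jpg')
#   annotated_bgr, preds = detect_emotion(img_bgr)
#   cv2.imwrite('face_annotated.jpg', annotated_bgr)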


def process_video(video):
    """Run emotion detection on a video, writing an annotated copy and a JSON
    file of per-frame detections to ./output."""
    basename = os.path.basename(video)
    name_only = os.path.splitext(basename)[0]
    os.makedirs('./output', exist_ok=True)
    video_outputpath = os.path.join('./output', basename)
    json_outputpath = os.path.join('./output', name_only + '.json')

    videocap = cv2.VideoCapture(video)
    ret, frame = videocap.read()
    if not ret:
        raise IOError('Could not read video: {}'.format(video))

    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
    fps = videocap.get(cv2.CAP_PROP_FPS) or 24.0  # fall back if the source reports no rate
    size = (frame.shape[1], frame.shape[0])
    out = cv2.VideoWriter(video_outputpath, fourcc, fps, size)

    max_frames = 50  # cap on the number of frames processed
    cnt = 0
    all_detections = []
    while ret and cnt < max_frames:
        processed_frame, detections = detect_emotion(frame)
        all_detections.append(detections)
        cv2.imshow('img', np.array(processed_frame, dtype=np.uint8))
        out.write(processed_frame)
        ret, frame = videocap.read()
        cv2.waitKey(1)
        cnt += 1

    videocap.release()
    out.release()
    cv2.destroyAllWindows()

    # Dump all per-frame detections as a single valid JSON document.
    with open(json_outputpath, "w") as jsonfile:
        jsonfile.write(json.dumps(all_detections, indent=4, cls=NpEncoder))
    return out


def main():
    parser = argparse.ArgumentParser(description='Analysis of Video')
    parser.add_argument(
        '-v', '--video', help='Video to be analysed', required=True)
    args = parser.parse_args()
    process_video(args.video)


if __name__ == '__main__':
    main()
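
# Hedged command-line sketch (placeholder file name; assumes the model weights
# under ./models/affectnet_emotions/ are present):
#
#   python face_emotion_detection.py -v sample.mp4
#
# Outputs: ./output/sample.mp4 (annotated video) and ./output/sample.json
# (per-frame face boxes and emotion scores).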