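# app.py — Gradio demo for multi-modal emotion recognition.
# The app extracts the audio track and a handful of face frames from an
# uploaded video, builds the visual, raw-waveform and MFCC inputs expected by
# the Keras model, and returns the last processed frame, the extracted audio
# and the predicted emotion label.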
import gradio as gr
import torch as pt
import torchaudio
import cv2
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from moviepy.editor import VideoFileClip

def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
    """Convert a video to an audio file with MoviePy, which uses `ffmpeg` under the hood."""
    filename, ext = os.path.splitext(video_file)
    clip = VideoFileClip(video_file)
    audio_path = f"{filename}.{output_ext}"
    clip.audio.write_audiofile(audio_path)
    clip.close()
    return audio_path
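
# Prepare the three model inputs from a single video:
#   train_visual     (1, 120, 120, 3, 10) — 10 face crops / resized frames
#   train_audio_wave (1, 261540)          — fixed-length raw waveform
#   train_audio_cnn  (1, 150, 512, 1)     — MFCC features of that waveform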
def process_video_audio(video_path):
    audio_path = convert_video_to_audio_moviepy(video_path)
    wav, sr = torchaudio.load(audio_path)
    # Pre-allocated buffers for the visual, raw-waveform and MFCC inputs.
    train_visual = pt.zeros([1, 120, 120, 3, 10])
    train_audio_wave = pt.zeros([1, 261540])
    train_audio_cnn = pt.zeros([1, 150, 512, 1])
    mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
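    # Crop or zero-pad the waveform to exactly 261540 samples, so the MFCC
    # transform (n_fft=1022, default hop of n_fft // 2 = 511) produces a
    # (150, 512) matrix that fills train_audio_cnn.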
    if len(wav[0]) > 261540:
        print(wav.shape)
        train_audio_wave[0, :] = wav[0][:261540]
    else:
        print(wav.shape)
        train_audio_wave[0, :len(wav[0])] = wav[0][:]
    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
    print(train_audio_cnn[0].shape)
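    # Sample every 10th frame from the first 100 frames (10 frames total).
    # For each sampled frame, crop the first detected face if one is found,
    # otherwise use the whole frame, and resize it to 120x120.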
    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    last_frame = None
    for i in range(100):
        ret, frame = cap.read()
        if ret and (i % 10 == 0):
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            if len(faces) > 0:
                # Crop the first detected face.
                (x, y, w, h) = faces[0]
                face = frame[y:y+h, x:x+w]
                resized_face = cv2.resize(face, (120, 120))
                train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_face)
            else:
                # No face found: fall back to the whole frame.
                resized_frame = cv2.resize(frame, (120, 120))
                train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_frame)
            last_frame = frame
            frame_idx += 1
    cap.release()
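    # Convert the PyTorch buffers to float16 TensorFlow tensors; the raw
    # waveform is reshaped to (1, 20, 13077) for the wave branch of the model.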
    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
    train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
    return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn
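
# Run the multi-modal model on the prepared inputs and return the index of
# the highest-scoring emotion class.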
def predict_emotion(video_path):
    last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
    # The trained model is reloaded on every call.
    model = load_model("model_vui_ve2392.keras")
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave
    })
    predicted_label = int(np.argmax(predictions))
    return last_frame, audio_path, predicted_label
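
# Gradio wrapper: map the predicted class index to a human-readable label
# and return the frame, audio and emotion for display.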
def predict_emotion_gradio(video_path):
    emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
    last_frame, audio_path, predicted_label = predict_emotion(video_path)
    predicted_emotion = emotion_dict[predicted_label]
    # OpenCV frames are BGR; convert to RGB so Gradio displays the colours correctly.
    if last_frame is not None:
        last_frame = cv2.cvtColor(last_frame, cv2.COLOR_BGR2RGB)
    return last_frame, audio_path, predicted_emotion
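
# Gradio interface: a video upload in; the last processed frame, the
# extracted audio track and the predicted emotion label out.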
iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[
        gr.Video(label="Upload a video")
    ],
    outputs=[
        gr.Image(label="Last Frame"),
        gr.Audio(label="Audio"),
        gr.Textbox(label="Predicted Emotion")
    ],
    title="Emotion recognition with multi-modal neural network",
    description="Upload a video and get the predicted emotion."
)

iface.launch()
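
# To try it locally (assuming the model file model_vui_ve2392.keras sits next
# to this script and ffmpeg is installed): run `python app.py` and open the
# local URL that Gradio prints.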