# DPL-Project / app.py
import gradio as gr
import torch as pt
import torchaudio
import cv2
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from moviepy.editor import VideoFileClip

def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
    """Convert a video file to audio using the MoviePy library, which uses `ffmpeg` under the hood."""
    filename, ext = os.path.splitext(video_file)
    clip = VideoFileClip(video_file)
    audio_path = f"{filename}.{output_ext}"
    clip.audio.write_audiofile(audio_path)
    return audio_path
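
# A minimal, commented usage sketch of the helper above (the file name "sample_video.mp4"
# is hypothetical; any container format that ffmpeg can decode should work):
#   wav_path = convert_video_to_audio_moviepy("sample_video.mp4")  # -> "sample_video.wav"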

def process_video_audio(video_path):
    # Extract the audio track from the video and load it as a waveform.
    audio_path = convert_video_to_audio_moviepy(video_path)
    wav, sr = torchaudio.load(audio_path)

    # Pre-allocate the three model inputs: 10 face crops of 120x120x3, a fixed-length
    # raw waveform of 261540 samples, and an MFCC "image" of 150 coefficients x 512 frames.
    train_visual = pt.zeros([1, 120, 120, 3, 10])
    train_audio_wave = pt.zeros([1, 261540])
    train_audio_cnn = pt.zeros([1, 150, 512, 1])

    mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Truncate or zero-pad the first channel to exactly 261540 samples.
    if len(wav[0]) > 261540:
        print(wav.shape)
        train_audio_wave[0, :] = wav[0][:261540]
    else:
        print(wav.shape)
        train_audio_wave[0, :len(wav[0])] = wav[0][:]

    # With n_fft=1022, torchaudio's default hop length is n_fft // 2 = 511, so 261540
    # samples yield 512 frames and the MFCC output has shape (150, 512).
    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
    print(train_audio_cnn[0].shape)

    # Sample every 10th of the first 100 frames (at most 10 frames) for the visual input.
    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    last_frame = None
    for i in range(100):
        ret, frame = cap.read()
        if ret and (i % 10 == 0):
            # Detect a face with the Haar cascade and crop it; fall back to the full frame.
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            if len(faces) > 0:
                (x, y, w, h) = faces[0]
                face = frame[y:y+h, x:x+w]
                resized_face = cv2.resize(face, (120, 120))
                train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_face)
            else:
                resized_frame = cv2.resize(frame, (120, 120))
                train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_frame)
            last_frame = frame
            frame_idx += 1
    cap.release()

    # Convert to TensorFlow tensors with the dtypes and shapes the Keras model expects.
    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
    train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
    return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn
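
# A quick, commented sanity check of the tensors returned above (shapes are taken from the
# code itself; the variable names and file path are only for illustration):
#   _, _, vis, wave, cnn = process_video_audio("sample_video.mp4")
#   assert vis.shape == (1, 120, 120, 3, 10)
#   assert wave.shape == (1, 20, 13077)    # 20 * 13077 == 261540 raw samples
#   assert cnn.shape == (1, 150, 512, 1)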

def predict_emotion(video_path):
    last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
    # The Keras model is reloaded on every call and takes the three modalities as named inputs.
    model = load_model("model_vui_ve2392.keras")
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave
    })
    predicted_label = int(np.argmax(predictions))
    return last_frame, audio_path, predicted_label
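
# Reloading the .keras file on every request is slow. A minimal sketch of one common
# alternative (not part of the original code) is to cache the model at module level:
#   _model = None
#   def get_model():
#       global _model
#       if _model is None:
#           _model = load_model("model_vui_ve2392.keras")
#       return _model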

def predict_emotion_gradio(video_path):
    emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
    last_frame, audio_path, predicted_label = predict_emotion(video_path)
    predicted_emotion = emotion_dict[predicted_label]
    # OpenCV frames are BGR; convert to RGB so gr.Image displays the colours correctly.
    if last_frame is not None:
        last_frame = cv2.cvtColor(last_frame, cv2.COLOR_BGR2RGB)
    return last_frame, audio_path, predicted_emotion
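
# For a quick local test outside the Gradio UI (the file name below is hypothetical):
#   frame, wav_path, emotion = predict_emotion_gradio("sample_video.mp4")
#   print(emotion)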

iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[
        gr.Video(label="Upload a video")
    ],
    outputs=[
        gr.Image(label="Last Frame"),
        gr.Audio(label="Audio"),
        gr.Textbox(label="Predicted Emotion")
    ],
    title="Emotion recognition with a multi-modal neural network",
    description="Upload a video and get the predicted emotion."
)

iface.launch()
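# On Hugging Face Spaces, launch() with no arguments is sufficient; when running locally and
# a temporary public link is wanted, Gradio also supports iface.launch(share=True).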