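# app.py — Gradio demo for multi-modal emotion recognition.
# The app extracts the audio track and a handful of face frames from an
# uploaded video, builds the visual, raw-waveform and MFCC inputs expected by
# the Keras model, and returns the last processed frame, the extracted audio
# and the predicted emotion label.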
import gradio as gr
import torch as pt
import torchaudio
import cv2
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from moviepy.editor import VideoFileClip

def convert_video_to_audio_moviepy(video_file, output_ext="wav"):
    """Convert a video to an audio file with MoviePy, which uses `ffmpeg` under the hood."""
    filename, ext = os.path.splitext(video_file)
    clip = VideoFileClip(video_file)
    audio_path = f"{filename}.{output_ext}"
    clip.audio.write_audiofile(audio_path)
    clip.close()
    return audio_path
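
# Prepare the three model inputs from a single video:
#   train_visual     (1, 120, 120, 3, 10) — 10 face crops / resized frames
#   train_audio_wave (1, 261540)          — fixed-length raw waveform
#   train_audio_cnn  (1, 150, 512, 1)     — MFCC features of that waveform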
def process_video_audio(video_path):
    audio_path = convert_video_to_audio_moviepy(video_path)
    wav, sr = torchaudio.load(audio_path)
    # Pre-allocated buffers for the visual, raw-waveform and MFCC inputs.
    train_visual = pt.zeros([1, 120, 120, 3, 10])
    train_audio_wave = pt.zeros([1, 261540])
    train_audio_cnn = pt.zeros([1, 150, 512, 1])
    mfcc = torchaudio.transforms.MFCC(n_mfcc=150, melkwargs={"n_fft": 1022, "n_mels": 150})
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
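    # Crop or zero-pad the waveform to exactly 261540 samples, so the MFCC
    # transform (n_fft=1022, default hop of n_fft // 2 = 511) produces a
    # (150, 512) matrix that fills train_audio_cnn.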
    if len(wav[0]) > 261540:
        print(wav.shape)
        train_audio_wave[0, :] = wav[0][:261540]
    else:
        print(wav.shape)
        train_audio_wave[0, :len(wav[0])] = wav[0][:]
    train_audio_cnn[0, :, :, 0] = mfcc(train_audio_wave[0])
    print(train_audio_cnn[0].shape)
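    # Sample every 10th frame from the first 100 frames (10 frames total).
    # For each sampled frame, crop the first detected face if one is found,
    # otherwise use the whole frame, and resize it to 120x120.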
    cap = cv2.VideoCapture(video_path)
    frame_idx = 0
    last_frame = None
    for i in range(100):
        ret, frame = cap.read()
        if ret and (i % 10 == 0):
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
            if len(faces) > 0:
                # Crop the first detected face.
                (x, y, w, h) = faces[0]
                face = frame[y:y+h, x:x+w]
                resized_face = cv2.resize(face, (120, 120))
                train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_face)
            else:
                # No face found: fall back to the whole frame.
                resized_frame = cv2.resize(frame, (120, 120))
                train_visual[0, :, :, :, frame_idx] = pt.tensor(resized_frame)
            last_frame = frame
            frame_idx += 1
    cap.release()
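    # Convert the PyTorch buffers to float16 TensorFlow tensors; the raw
    # waveform is reshaped to (1, 20, 13077) for the wave branch of the model.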
    train_visual = tf.convert_to_tensor(train_visual.numpy(), dtype=tf.float16)
    train_audio_wave = tf.reshape(tf.convert_to_tensor(train_audio_wave.numpy(), dtype=tf.float16), (1, 20, 13077))
    train_audio_cnn = tf.convert_to_tensor(train_audio_cnn.numpy(), dtype=tf.float16)
    return last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn
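
# Run the multi-modal model on the prepared inputs and return the index of
# the highest-scoring emotion class.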
def predict_emotion(video_path):
    last_frame, audio_path, train_visual, train_audio_wave, train_audio_cnn = process_video_audio(video_path)
    # The trained model is reloaded on every call.
    model = load_model("model_vui_ve2392.keras")
    predictions = model.predict({
        "input_visual": train_visual,
        "input_audio_cnn": train_audio_cnn,
        "input_audio_wave": train_audio_wave
    })
    predicted_label = int(np.argmax(predictions))
    return last_frame, audio_path, predicted_label
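
# Gradio wrapper: map the predicted class index to a human-readable label
# and return the frame, audio and emotion for display.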
def predict_emotion_gradio(video_path):
    emotion_dict = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
    last_frame, audio_path, predicted_label = predict_emotion(video_path)
    predicted_emotion = emotion_dict[predicted_label]
    # OpenCV frames are BGR; convert to RGB so Gradio displays the colours correctly.
    if last_frame is not None:
        last_frame = cv2.cvtColor(last_frame, cv2.COLOR_BGR2RGB)
    return last_frame, audio_path, predicted_emotion
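
# Gradio interface: a video upload in; the last processed frame, the
# extracted audio track and the predicted emotion label out.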
iface = gr.Interface(
    fn=predict_emotion_gradio,
    inputs=[
        gr.Video(label="Upload a video")
    ],
    outputs=[
        gr.Image(label="Last Frame"),
        gr.Audio(label="Audio"),
        gr.Textbox(label="Predicted Emotion")
    ],
    title="Emotion recognition with multi-modal neural network",
    description="Upload a video and get the predicted emotion."
)

iface.launch()
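
# To try it locally (assuming the model file model_vui_ve2392.keras sits next
# to this script and ffmpeg is installed): run `python app.py` and open the
# local URL that Gradio prints.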