import math
import os
from io import BytesIO

import gradio as gr
import cv2
from PIL import Image
import requests
from transformers import pipeline
from pydub import AudioSegment
from faster_whisper import WhisperModel
import joblib
import mediapipe as mp
import numpy as np
import pandas as pd
import moviepy.editor as mpe
theme = gr.themes.Base(
    primary_hue="cyan",
    secondary_hue="blue",
    neutral_hue="slate",
)
# Speech-to-text model (runs locally on CPU)
model = WhisperModel("small", device="cpu", compute_type="int8")

# Pre-trained scikit-learn classifier for body language ('Good' / 'Bad')
body_lang_model = joblib.load('body_language.pkl')

# MediaPipe solutions: Holistic for pose + face landmarks, FaceMesh for head-pose estimation
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)
API_KEY = os.getenv('HF_API_KEY')

# Facial emotion classification (per video frame) and text emotion classification (per transcript segment);
# top_k=None makes the text classifier return scores for every go_emotions label
pipe1 = pipeline("image-classification", model="dima806/facial_emotions_image_detection")
pipe2 = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)

# Speech emotion recognition is delegated to the Hugging Face Inference API
AUDIO_API_URL = "https://api-inference.huggingface.co/models/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
headers = {"Authorization": f"Bearer {API_KEY}"}
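
# Overall flow: an uploaded or recorded interview video is split into sampled frames
# (head pose -> distraction rate, Holistic landmarks -> body language, facial emotions)
# and into its audio track (speech emotion via the Inference API, transcription via
# faster-whisper, per-segment text emotions). The aggregated results are posted to a
# local backend at the end of video_to_audio().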

def extract_frames(video_path):
    # Re-encode the input to a fixed 60 fps mp4 before frame sampling
    clip = mpe.VideoFileClip(video_path)
    clip.write_videofile('mp4file.mp4', fps=60)

    cap = cv2.VideoCapture('mp4file.mp4')
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = int(fps / 2)  # sample roughly two frames per second
    print(interval, total_frames)

    result = []          # per-sampled-frame facial emotion scores
    distract_count = 0   # frames where the head is turned away
    total_count = 0      # number of sampled frames
    output_list = []     # per-sampled-frame body language probabilities

    for i in range(0, total_frames, interval):
        total_count += 1
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            image = cv2.cvtColor(cv2.flip(frame, 1), cv2.COLOR_BGR2RGB)
            image.flags.writeable = False
            results = face_mesh.process(image)
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            img_h, img_w, img_c = image.shape
            face_3d = []
            face_2d = []
            flag = False
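            # Head-pose estimation: six reference FaceMesh landmarks (eye corners, nose tip,
            # mouth corners, chin) are fed to cv2.solvePnP with a simple pinhole camera matrix;
            # the rotation is decomposed into angles, and if the scaled pitch or yaw leaves the
            # [-7, 7] band the frame is flagged as "looking away".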
            if results.multi_face_landmarks:
                for face_landmarks in results.multi_face_landmarks:
                    for idx, lm in enumerate(face_landmarks.landmark):
                        if idx == 33 or idx == 263 or idx == 1 or idx == 61 or idx == 291 or idx == 199:
                            if idx == 1:
                                nose_2d = (lm.x * img_w, lm.y * img_h)
                                nose_3d = (lm.x * img_w, lm.y * img_h, lm.z * 3000)
                            x, y = int(lm.x * img_w), int(lm.y * img_h)
                            face_2d.append([x, y])
                            face_3d.append([x, y, lm.z])

                    face_2d = np.array(face_2d, dtype=np.float64)
                    face_3d = np.array(face_3d, dtype=np.float64)
                    focal_length = 1 * img_w
                    cam_matrix = np.array([[focal_length, 0, img_h / 2],
                                           [0, focal_length, img_w / 2],
                                           [0, 0, 1]])
                    dist_matrix = np.zeros((4, 1), dtype=np.float64)
                    success, rot_vec, trans_vec = cv2.solvePnP(face_3d, face_2d, cam_matrix, dist_matrix)
                    rmat, jac = cv2.Rodrigues(rot_vec)
                    angles, mtxR, mtxQ, Qx, Qy, Qz = cv2.RQDecomp3x3(rmat)
                    x = angles[0] * 360
                    y = angles[1] * 360
                    z = angles[2] * 360
                    # Head turned too far up/down or left/right -> treat this frame as distracted
                    if y < -7 or y > 7 or x < -7 or x > 7:
                        flag = True
                    else:
                        flag = False

            if flag:
                distract_count += 1
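
            # Body language: flatten the Holistic pose and face landmark coordinates into one
            # feature row, in the same order the pickled scikit-learn model expects, and collect
            # the per-class ('Good'/'Bad') probabilities for this frame.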
            image2 = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results2 = holistic.process(image2)
            # Skip the frame if Holistic did not detect the person (avoids AttributeError on None)
            if results2.pose_landmarks and results2.face_landmarks:
                pose = results2.pose_landmarks.landmark
                pose_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose]).flatten())
                face = results2.face_landmarks.landmark
                face_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in face]).flatten())
                row = pose_row + face_row
                X = pd.DataFrame([row])
                body_language_class = body_lang_model.predict(X)[0]
                body_language_prob = body_lang_model.predict_proba(X)[0]
                output_dict = {}
                for class_name, prob in zip(body_lang_model.classes_, body_language_prob):
                    output_dict[class_name] = prob
                output_list.append(output_dict)
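
            # Facial emotion: the image-classification pipeline returns a list of
            # {'label': ..., 'score': ...} dicts for the top predicted emotions.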
            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            response = pipe1(pil_image)
            temp = {}
            for ele in response:
                temp[ele['label']] = ele['score']
            result.append(temp)
    distraction_rate = distract_count / total_count

    # Average the per-frame 'Bad'/'Good' probabilities over every frame that produced a prediction
    total_bad_prob = 0
    total_good_prob = 0
    for output_dict in output_list:
        total_bad_prob += output_dict['Bad']
        total_good_prob += output_dict['Good']
    num_frames = max(len(output_list), 1)  # guard against division by zero if no frame yielded landmarks
    avg_bad_prob = total_bad_prob / num_frames
    avg_good_prob = total_good_prob / num_frames
    final_output = {'Bad': avg_bad_prob, 'Good': avg_good_prob}
    cap.release()

    # Average the facial emotion scores across all sampled frames, keeping only non-zero emotions
    video_emotion_totals = {}
    emotion_totals = {
        'admiration': 0.0, 'amusement': 0.0, 'angry': 0.0, 'annoyance': 0.0, 'approval': 0.0,
        'caring': 0.0, 'confusion': 0.0, 'curiosity': 0.0, 'desire': 0.0, 'disappointment': 0.0,
        'disapproval': 0.0, 'disgust': 0.0, 'embarrassment': 0.0, 'excitement': 0.0, 'fear': 0.0,
        'gratitude': 0.0, 'grief': 0.0, 'happy': 0.0, 'love': 0.0, 'nervousness': 0.0,
        'optimism': 0.0, 'pride': 0.0, 'realization': 0.0, 'relief': 0.0, 'remorse': 0.0,
        'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0
    }
    counter = 0
    for ele in result:
        counter += 1
        for emotion in ele.keys():
            emotion_totals[emotion] += ele.get(emotion)
    for emotion in emotion_totals:
        emotion_totals[emotion] /= counter
        if emotion_totals[emotion] > 0.0:
            video_emotion_totals[emotion] = emotion_totals[emotion]
    return video_emotion_totals, result, final_output, distraction_rate

def analyze_sentiment(text):
    # Classify the text against all go_emotions labels and return {label: score}
    response = pipe2(text)
    # With top_k=None some transformers versions nest the scores one level deeper
    if response and isinstance(response[0], list):
        response = response[0]
    sentiment_results = {}
    for ele in response:
        sentiment_results[ele['label']] = ele['score']
    return sentiment_results

def video_to_audio(input_video):
    video_emotion_totals, frames_sentiments, body_language, distraction_rate = extract_frames(input_video)
    print("Total Video Emotions ... Done")
    print("Video Frame Sentiment ... Done")
    print("Body Language ... Done")
    print("Distraction Rate ... Done")
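
    # Audio analysis: extract the soundtrack with pydub, export it as WAV bytes, and send the
    # raw bytes to the Inference API endpoint for speech emotion recognition (Bearer-token auth).
    # A second copy of the bytes is kept for local transcription with faster-whisper.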
    audio = AudioSegment.from_file(input_video)
    audio_binary = audio.export(format="wav").read()
    audio_bytesio = BytesIO(audio_binary)
    audio_bytesio2 = BytesIO(audio_binary)

    response = requests.post(AUDIO_API_URL, headers=headers, data=audio_bytesio)
    formatted_response = {}
    for ele in response.json():
        formatted_response[ele['label']] = ele['score']
    print("Speech Sentiments ... Done")
    segments, info = model.transcribe(audio_bytesio2, beam_size=5)
    transcript = ''
    video_sentiment_final = []
    final_output = []
    for segment in segments:
        transcript = transcript + segment.text + " "
        transcript_segment_sentiment = analyze_sentiment(segment.text)
        emotion_totals = {
            'admiration': 0.0, 'amusement': 0.0, 'angry': 0.0, 'annoyance': 0.0, 'approval': 0.0,
            'caring': 0.0, 'confusion': 0.0, 'curiosity': 0.0, 'desire': 0.0, 'disappointment': 0.0,
            'disapproval': 0.0, 'disgust': 0.0, 'embarrassment': 0.0, 'excitement': 0.0, 'fear': 0.0,
            'gratitude': 0.0, 'grief': 0.0, 'happy': 0.0, 'love': 0.0, 'nervousness': 0.0,
            'optimism': 0.0, 'pride': 0.0, 'realization': 0.0, 'relief': 0.0, 'remorse': 0.0,
            'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0
        }
        counter = 0
        # Frames were sampled about twice per second, so indexing frames_sentiments by whole
        # seconds of the segment stays within bounds
        for i in range(math.ceil(segment.start), math.floor(segment.end)):
            counter += 1
            for emotion in frames_sentiments[i].keys():
                emotion_totals[emotion] += frames_sentiments[i].get(emotion)
        counter = max(counter, 1)  # sub-second segments would otherwise divide by zero
        for emotion in emotion_totals:
            emotion_totals[emotion] /= counter
        video_sentiment_final.append(emotion_totals)
        video_segment_sentiment = {key: value for key, value in emotion_totals.items() if value != 0.0}
        segment_finals = {segment.id: (segment.text, segment.start, segment.end, transcript_segment_sentiment, video_segment_sentiment)}
        final_output.append(segment_finals)

    total_transcript_sentiment = {key: value for key, value in analyze_sentiment(transcript).items() if value >= 0.01}
    print("Full Transcript Sentiments ... Done")
    emotion_finals = {
        'admiration': 0.0, 'amusement': 0.0, 'angry': 0.0, 'annoyance': 0.0, 'approval': 0.0,
        'caring': 0.0, 'confusion': 0.0, 'curiosity': 0.0, 'desire': 0.0, 'disappointment': 0.0,
        'disapproval': 0.0, 'disgust': 0.0, 'embarrassment': 0.0, 'excitement': 0.0, 'fear': 0.0,
        'gratitude': 0.0, 'grief': 0.0, 'happy': 0.0, 'love': 0.0, 'nervousness': 0.0,
        'optimism': 0.0, 'pride': 0.0, 'realization': 0.0, 'relief': 0.0, 'remorse': 0.0,
        'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0
    }
    for i in range(len(video_sentiment_final)):  # include every segment in the average
        for emotion in video_sentiment_final[i].keys():
            emotion_finals[emotion] += video_sentiment_final[i].get(emotion)
    for emotion in emotion_finals:
        emotion_finals[emotion] /= len(video_sentiment_final)
    emotion_finals = {key: value for key, value in emotion_finals.items() if value != 0.0}
    print("Video Frame (Mapping & AVG.) ... Done")
    print("\nProcessing Completed!!\n")

    # Bundle everything and hand it off to the local backend
    payload = {
        'from': 'gradio',
        'total_video_emotions': video_emotion_totals,
        'emotions_final': emotion_finals,
        'body_language': body_language,
        'distraction_rate': distraction_rate,
        'formatted_response': formatted_response,
        'total_transcript_sentiment': total_transcript_sentiment
    }
    print(payload)
    response = requests.post('http://127.0.0.1:5000/interview', json=payload)
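
# Gradio UI: a single Video input that accepts an upload or a webcam recording; analysis is
# triggered by the stop_recording event, i.e. when a webcam recording finishes.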
with gr.Blocks(theme=theme, css=".gradio-container { background: rgba(255, 255, 255, 0.2) !important; box-shadow: 0 8px 32px 0 rgba( 31, 38, 135, 0.37 ) !important; backdrop-filter: blur( 10px ) !important; -webkit-backdrop-filter: blur( 10px ) !important; border-radius: 10px !important; border: 1px solid rgba( 0, 0, 0, 0.5 ) !important;}") as Video:
    input_video = gr.Video(sources=["upload", "webcam"], format='mp4')
    input_video.stop_recording(fn=video_to_audio, inputs=input_video)

Video.launch()