import math import os from io import BytesIO import gradio as gr import cv2 from PIL import Image import requests from transformers import pipeline from pydub import AudioSegment from faster_whisper import WhisperModel import joblib import mediapipe as mp import numpy as np import pandas as pd import moviepy.editor as mpe theme = gr.themes.Base( primary_hue="cyan", secondary_hue="blue", neutral_hue="slate", ) model = WhisperModel("small", device="cpu", compute_type="int8") body_lang_model = joblib.load('body_language.pkl') mp_holistic = mp.solutions.holistic holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) mp_face_mesh = mp.solutions.face_mesh face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5) API_KEY = os.getenv('HF_API_KEY') pipe1 = pipeline("image-classification", model="dima806/facial_emotions_image_detection") pipe2 = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions") AUDIO_API_URL = "https://api-inference.huggingface.co/models/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" headers = {"Authorization": "Bearer " + API_KEY + ""} def extract_frames(video_path): clip = mpe.VideoFileClip(video_path) clip.write_videofile('mp4file.mp4', fps=60) cap = cv2.VideoCapture('mp4file.mp4') fps = int(cap.get(cv2.CAP_PROP_FPS)) total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) interval = int(fps/2) print(interval, total_frames) result = [] distract_count = 0 total_count = 0 output_list = [] for i in range(0, total_frames, interval): total_count += 1 cap.set(cv2.CAP_PROP_POS_FRAMES, i) ret, frame = cap.read() if ret: image = cv2.cvtColor(cv2.flip(frame, 1), cv2.COLOR_BGR2RGB) image.flags.writeable = False results = face_mesh.process(image) image.flags.writeable = True image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) img_h, img_w, img_c = image.shape face_3d = [] face_2d = [] flag = False if results.multi_face_landmarks: for face_landmarks in results.multi_face_landmarks: for idx, lm in enumerate(face_landmarks.landmark): if idx == 33 or idx == 263 or idx == 1 or idx == 61 or idx == 291 or idx == 199: if idx == 1: nose_2d = (lm.x * img_w, lm.y * img_h) nose_3d = (lm.x * img_w, lm.y * img_h, lm.z * 3000) x, y = int(lm.x * img_w), int(lm.y * img_h) face_2d.append([x, y]) face_3d.append([x, y, lm.z]) face_2d = np.array(face_2d, dtype=np.float64) face_3d = np.array(face_3d, dtype=np.float64) focal_length = 1 * img_w cam_matrix = np.array([ [focal_length, 0, img_h / 2], [0, focal_length, img_w / 2], [0, 0, 1]]) dist_matrix = np.zeros((4, 1), dtype=np.float64) success, rot_vec, trans_vec = cv2.solvePnP(face_3d, face_2d, cam_matrix, dist_matrix) rmat, jac = cv2.Rodrigues(rot_vec) angles, mtxR, mtxQ, Qx, Qy, Qz = cv2.RQDecomp3x3(rmat) x = angles[0] * 360 y = angles[1] * 360 z = angles[2] * 360 if y < -7 or y > 7 or x < -7 or x > 7: flag = True else: flag = False if flag == True: distract_count += 1 image2 = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) results2 = holistic.process(image2) pose = results2.pose_landmarks.landmark pose_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose]).flatten()) face = results2.face_landmarks.landmark face_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in face]).flatten()) row = pose_row+face_row X = pd.DataFrame([row]) body_language_class = body_lang_model.predict(X)[0] body_language_prob = body_lang_model.predict_proba(X)[0] output_dict = {} for class_name, prob in zip(body_lang_model.classes_, body_language_prob): output_dict[class_name] = prob output_list.append(output_dict) pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) response = pipe1(pil_image) temp = {} for ele in response: label, score = ele.values() temp[label] = score result.append(temp) distraction_rate = distract_count/total_count total_bad_prob = 0 total_good_prob = 0 for output_dict in output_list: total_bad_prob += output_dict['Bad'] total_good_prob += output_dict['Good'] num_frames = len(output_list) avg_bad_prob = total_bad_prob / num_frames avg_good_prob = total_good_prob / num_frames final_output = {'Bad': avg_bad_prob, 'Good': avg_good_prob} cap.release() video_emotion_totals = {} emotion_totals = { 'admiration': 0.0, 'amusement': 0.0, 'angry': 0.0, 'annoyance': 0.0, 'approval': 0.0, 'caring': 0.0, 'confusion': 0.0, 'curiosity': 0.0, 'desire': 0.0, 'disappointment': 0.0, 'disapproval': 0.0, 'disgust': 0.0, 'embarrassment': 0.0, 'excitement': 0.0, 'fear': 0.0, 'gratitude': 0.0, 'grief': 0.0, 'happy': 0.0, 'love': 0.0, 'nervousness': 0.0, 'optimism': 0.0, 'pride': 0.0, 'realization': 0.0, 'relief': 0.0, 'remorse': 0.0, 'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0 } counter = 0 for ele in result: for emotion in ele.keys(): emotion_totals[emotion] += ele.get(emotion) counter += 1 for emotion in emotion_totals: emotion_totals[emotion] /= counter if (emotion_totals[emotion]) > 0.0: video_emotion_totals[emotion] = emotion_totals[emotion] return video_emotion_totals, result, final_output, distraction_rate def analyze_sentiment(text): response = pipe2(text) sentiment_results = {} for ele in response: label, score = ele.values() sentiment_results[label] = score return sentiment_results def video_to_audio(input_video): video_emotion_totals, frames_sentiments, body_language, distraction_rate = extract_frames(input_video) print("Total Video Emotions ... Done") print("Video Frame Sentiment ... Done") print("Body Language ... Done") print("Distraction Rate ... Done") cap = cv2.VideoCapture(input_video) fps = int(cap.get(cv2.CAP_PROP_FPS)) audio = AudioSegment.from_file(input_video) audio_binary = audio.export(format="wav").read() audio_bytesio = BytesIO(audio_binary) audio_bytesio2 = BytesIO(audio_binary) response = requests.post(AUDIO_API_URL, headers=headers, data=audio_bytesio) formatted_response = {} for ele in response.json(): score, label = ele.values() formatted_response[label] = score print("Speech Sentiments ... Done") segments, info = model.transcribe(audio_bytesio2, beam_size=5) transcript = '' video_sentiment_final = [] final_output = [] for segment in segments: transcript = transcript + segment.text + " " transcript_segment_sentiment = analyze_sentiment(segment.text) emotion_totals = { 'admiration': 0.0, 'amusement': 0.0, 'angry': 0.0, 'annoyance': 0.0, 'approval': 0.0, 'caring': 0.0, 'confusion': 0.0, 'curiosity': 0.0, 'desire': 0.0, 'disappointment': 0.0, 'disapproval': 0.0, 'disgust': 0.0, 'embarrassment': 0.0, 'excitement': 0.0, 'fear': 0.0, 'gratitude': 0.0, 'grief': 0.0, 'happy': 0.0, 'love': 0.0, 'nervousness': 0.0, 'optimism': 0.0, 'pride': 0.0, 'realization': 0.0, 'relief': 0.0, 'remorse': 0.0, 'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0 } counter = 0 for i in range(math.ceil(segment.start), math.floor(segment.end)): for emotion in frames_sentiments[i].keys(): emotion_totals[emotion] += frames_sentiments[i].get(emotion) counter += 1 for emotion in emotion_totals: emotion_totals[emotion] /= counter video_sentiment_final.append(emotion_totals) video_segment_sentiment = {key: value for key, value in emotion_totals.items() if value != 0.0} segment_finals = {segment.id: (segment.text, segment.start, segment.end, transcript_segment_sentiment, video_segment_sentiment)} final_output.append(segment_finals) total_transcript_sentiment = {key: value for key, value in analyze_sentiment(transcript).items() if value >= 0.01} print("Full Transcript Sentiments ... Done") emotion_finals = { 'admiration': 0.0, 'amusement': 0.0, 'angry': 0.0, 'annoyance': 0.0, 'approval': 0.0, 'caring': 0.0, 'confusion': 0.0, 'curiosity': 0.0, 'desire': 0.0, 'disappointment': 0.0, 'disapproval': 0.0, 'disgust': 0.0, 'embarrassment': 0.0, 'excitement': 0.0, 'fear': 0.0, 'gratitude': 0.0, 'grief': 0.0, 'happy': 0.0, 'love': 0.0, 'nervousness': 0.0, 'optimism': 0.0, 'pride': 0.0, 'realization': 0.0, 'relief': 0.0, 'remorse': 0.0, 'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0 } for i in range(0, video_sentiment_final.__len__()-1): for emotion in video_sentiment_final[i].keys(): emotion_finals[emotion] += video_sentiment_final[i].get(emotion) for emotion in emotion_finals: emotion_finals[emotion] /= video_sentiment_final.__len__() emotion_finals = {key: value for key, value in emotion_finals.items() if value != 0.0} print("Video Frame (Mapping & AVG.) ... Done") print("\nProcessing Completed!!\n") payload = { 'from': 'gradio', 'total_video_emotions': video_emotion_totals, 'emotions_final': emotion_finals, 'body_language': body_language, 'distraction_rate': distraction_rate, 'formatted_response': formatted_response, 'total_transcript_sentiment': total_transcript_sentiment } print(payload) response = requests.post('http://127.0.0.1:5000/interview', json=payload) with gr.Blocks(theme=theme, css=".gradio-container { background: rgba(255, 255, 255, 0.2) !important; box-shadow: 0 8px 32px 0 rgba( 31, 38, 135, 0.37 ) !important; backdrop-filter: blur( 10px ) !important; -webkit-backdrop-filter: blur( 10px ) !important; border-radius: 10px !important; border: 1px solid rgba( 0, 0, 0, 0.5 ) !important;}") as Video: input_video = gr.Video(sources=["upload", "webcam"], format='mp4') input_video.stop_recording(fn=video_to_audio, inputs=input_video) Video.launch()