# Spaces: Running
import gradio as gr | |
import os | |
import cv2 | |
import face_recognition | |
from fastai.vision.all import load_learner | |
import time | |
import base64 | |
from deepface import DeepFace | |
import torchaudio | |
import moviepy.editor as mp | |
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline | |
# import pathlib | |
# temp = pathlib.PosixPath | |
# pathlib.PosixPath = pathlib.WindowsPath | |
# Face-detector backend names; video_processing picks one of these by index
# and passes it to DeepFace.analyze as detector_backend.
backends = ["opencv", "ssd", "dlib", "mtcnn", "retinaface", "mediapipe"]

# Text classifiers applied to the transcription of the uploaded video.
emotion_pipeline = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-emotion",
)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# fastai learner whose predictions are compared against the labels
# 'on_camera' / 'off_camera' in video_processing.
model = load_learner("gaze-recognizer-v3.pkl")
def analyze_emotion(text):
    """Classify ``text`` with the module-level emotion pipeline.

    Returns whatever ``emotion_pipeline`` returns for ``text``, unmodified.
    """
    return emotion_pipeline(text)
def analyze_sentiment(text):
    """Classify ``text`` with the module-level sentiment pipeline.

    Returns whatever ``sentiment_pipeline`` returns for ``text``, unmodified.
    """
    return sentiment_pipeline(text)
_WHISPER_CHECKPOINT = "openai/whisper-tiny"
_WHISPER_SAMPLE_RATE = 16000
_AUDIO_PATH = "audio.wav"
# Filled on first use so the Whisper weights are loaded once per process
# instead of once per request.
_whisper_cache = {}


def _load_whisper():
    """Return the ``(processor, model)`` pair for Whisper, loading it on first use."""
    if not _whisper_cache:
        processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
        whisper_model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
        whisper_model.config.forced_decoder_ids = None
        # Populate the cache only after both loads succeeded.
        _whisper_cache["processor"] = processor
        _whisper_cache["model"] = whisper_model
    return _whisper_cache["processor"], _whisper_cache["model"]


def getTranscription(path):
    """Transcribe the audio track of the video at ``path`` with Whisper.

    The audio is extracted to ``audio.wav`` in the working directory,
    resampled to 16 kHz, reduced to its first channel and decoded with the
    ``openai/whisper-tiny`` checkpoint. The intermediate WAV file is removed
    once it has been read.

    Args:
        path: Path of a video file readable by moviepy.

    Returns:
        The transcription as a string, or ``""`` when the video has no
        audio track.

    NOTE(review): Whisper's feature extractor presumably pads/truncates the
    input to a fixed 30 s window, so longer recordings may be transcribed
    only partially -- confirm against the transformers documentation.
    """
    clip = mp.VideoFileClip(path)
    try:
        if clip.audio is None:
            # Nothing to transcribe; avoid AttributeError on None.
            return ""
        clip.audio.write_audiofile(_AUDIO_PATH)
    finally:
        # Release the reader held by moviepy.
        clip.close()

    try:
        waveform, sample_rate = torchaudio.load(_AUDIO_PATH)
    finally:
        if os.path.exists(_AUDIO_PATH):
            os.remove(_AUDIO_PATH)

    resampler = torchaudio.transforms.Resample(sample_rate, _WHISPER_SAMPLE_RATE)
    # Keep only the first channel, as the original implementation did.
    waveform = resampler(waveform)[0]

    processor, whisper_model = _load_whisper()
    input_features = processor(
        waveform.numpy(),
        sampling_rate=_WHISPER_SAMPLE_RATE,
        return_tensors="pt",
    ).input_features
    predicted_ids = whisper_model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
_TEMP_VIDEO_PATH = "temp_video.mp4"
# One frame is analysed for every this-many frames read from the video.
# NOTE(review): 24 * 3 looks like "every 3 seconds at 24 fps" -- confirm; the
# real frame rate of the input is never queried.
_FRAME_STRIDE = 24 * 3
# (key in DeepFace's emotion scores, label used in the log output)
_EMOTION_LABELS = (
    ("angry", "anger"),
    ("disgust", "disgust"),
    ("fear", "fear"),
    ("happy", "happy"),
    ("sad", "sad"),
    ("surprise", "surprise"),
    ("neutral", "neutral"),
)


def _sample_frames(video_capture, stride):
    """Yield every ``stride``-th frame of ``video_capture`` until a read fails."""
    while True:
        frame = None
        for _ in range(stride):
            ret, frame = video_capture.read()
            if not ret:
                # End of stream (or read error): drop the partial batch.
                return
        yield frame


def video_processing(video_file, encoded_video):
    """Analyse a video for gaze, facial emotion, and spoken-text emotion/sentiment.

    Args:
        video_file: Path of the video to analyse. Ignored when
            ``encoded_video`` is non-empty.
        encoded_video: Base64-encoded video data. When non-empty it is decoded
            to ``temp_video.mp4`` and analysed instead of ``video_file``.

    Returns:
        A single string containing the gaze percentage (or a "no face
        detected" message), the text emotion result, the transcription and
        the text sentiment result. The averaged facial emotion scores are
        printed to stdout only.
    """
    emotion_totals = {key: 0 for key, _ in _EMOTION_LABELS}
    emotion_count = 0
    on_camera = 0
    off_camera = 0
    total = 0

    # Truthiness check also covers None, not just the empty string.
    if encoded_video:
        decoded_file_data = base64.b64decode(encoded_video)
        with open(_TEMP_VIDEO_PATH, "wb") as f:
            f.write(decoded_file_data)
        video_file = _TEMP_VIDEO_PATH

    start_time = time.time()
    try:
        transcription = getTranscription(video_file)
        print(transcription)
        text_emotion = analyze_emotion(transcription)
        print(text_emotion)
        text_sentiment = analyze_sentiment(transcription)
        print(text_sentiment)

        video_capture = cv2.VideoCapture(video_file)
        try:
            for frame in _sample_frames(video_capture, _FRAME_STRIDE):
                # Face detection and the gaze model both run on the grayscale frame.
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                face_locations = face_recognition.face_locations(gray)
                for top, right, bottom, left in face_locations:
                    face_image = gray[top:bottom, left:right]
                    color_image = frame[top:bottom, left:right]
                    if face_image.size == 0:
                        # Degenerate crop; cv2.resize would raise on it.
                        continue
                    resized_face_image = cv2.resize(face_image, (128, 128))

                    try:
                        # Original author's note: backends 2, 3 and 4 work.
                        analysis = DeepFace.analyze(
                            color_image,
                            actions=['emotion'],
                            detector_backend=backends[2],
                            enforce_detection=False,
                        )
                    except Exception as e:
                        # Best effort: skip the emotion scores for this face
                        # instead of reusing stale ones from a previous face.
                        print(f"Emotion analysis failed for a face crop: {e}")
                    else:
                        scores = analysis[0]['emotion']
                        print(scores)
                        for key in emotion_totals:
                            emotion_totals[key] += scores[key]
                        emotion_count += 1

                    # Predict the gaze class of the resized face image.
                    result = model.predict(resized_face_image)
                    print(result[0])
                    if result[0] == 'on_camera':
                        on_camera += 1
                    elif result[0] == 'off_camera':
                        off_camera += 1
                    total += 1
        finally:
            video_capture.release()
            cv2.destroyAllWindows()
    finally:
        # Always clean up the decoded upload, even when analysis fails.
        if os.path.exists(_TEMP_VIDEO_PATH):
            os.remove(_TEMP_VIDEO_PATH)

    try:
        gaze_percentage = on_camera / total * 100
    except ZeroDivisionError as e:
        print(f"An error occurred while processing the video: {e}")
        gaze_percentage = f'no face detected Total = {total},on_camera = {on_camera},off_camera = {off_camera}'
    print(f'Total = {total},on_camera = {on_camera},off_camera = {off_camera}')

    end_time = time.time()
    print(f'Time taken: {end_time-start_time}')
    print(gaze_percentage)

    # Average the per-face scores; report zeros when no face was scored
    # rather than dividing by zero.
    if emotion_count:
        averages = {key: value / emotion_count for key, value in emotion_totals.items()}
    else:
        averages = {key: 0.0 for key in emotion_totals}
    for key, label in _EMOTION_LABELS:
        print(f'total {label} percentage = {averages[key]}')

    final_result = "Gaze = "+str(gaze_percentage)+"Text Emotion"+str(text_emotion)+"Text transcription"+str(transcription)+"Text sentiment"+str(text_sentiment)
    return final_result
# Gradio UI: a video input and a text input (the base64-encoded video) are
# passed to video_processing; its string result is shown as text.
demo = gr.Interface(
    fn=video_processing,
    inputs=["video", "text"],
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()