import base64
import cv2
import face_recognition
import gradio as gr
import moviepy.editor as mp
import os
import time
import torchaudio
from fastai.vision.all import load_learner
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline

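# Hugging Face pipelines used to classify the emotion and sentiment of the transcript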
emotion_pipeline = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion")
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

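# FastAI learner that classifies a face crop as 'on_camera' or 'off_camera'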
model = load_learner("gaze-recognizer-v3.pkl")

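# Extract the audio track from the video and write it to a temporary WAV file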
def extract_audio(video_path):
    clip = mp.VideoFileClip(video_path)
    clip.audio.write_audiofile("audio.wav")

def analyze_emotion(text):
    result = emotion_pipeline(text)
    return result

def analyze_sentiment(text):
    result = sentiment_pipeline(text)
    return result

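# Transcribe the video's audio track with the Whisper tiny checkpoint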
def get_transcription(path):
    extract_audio(path)

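    # Whisper expects 16 kHz audio, so resample the extracted waveform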
    waveform, sample_rate = torchaudio.load("audio.wav")
    resampler = torchaudio.transforms.Resample(sample_rate, 16000)
    waveform = resampler(waveform)[0]

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    model.config.forced_decoder_ids = None

    input_features = processor(waveform.squeeze(dim=0), sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

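# Detect the first face in a frame and classify its gaze with the FastAI model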
def process_frame(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    face_locations = face_recognition.face_locations(gray)

    if len(face_locations) > 0:
        for top, right, bottom, left in face_locations:
            face_image = gray[top:bottom, left:right]
            resized_face_image = cv2.resize(face_image, (128, 128))
            result = model.predict(resized_face_image)

            return result[0]

    return None

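# Main entry point: accepts a video file path or a base64-encoded video,
# transcribes the audio and samples frames to estimate how often the speaker looks at the camera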
def video_processing(video_file, encoded_video):
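    # If a base64-encoded video was supplied, decode it to a temporary file first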
    if encoded_video != "":
        decoded_file_data = base64.b64decode(encoded_video)
        with open("temp_video.mp4", "wb") as f:
            f.write(decoded_file_data)
        video_file = "temp_video.mp4"

    transcription = get_transcription(video_file)
    print(transcription)

    video_capture = cv2.VideoCapture(video_file)
    on_camera = 0
    off_camera = 0
    total = 0
    emotions = []

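    # Sample roughly one frame every 3 seconds (assuming 24 fps) by reading and discarding the frames in between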
    while True:
        for _ in range(24 * 3):
            ret, frame = video_capture.read()
            if not ret:
                break

        if not ret:
            break

        result = process_frame(frame)
        if result:
            if result == 'on_camera':
                on_camera += 1
            elif result == 'off_camera':
                off_camera += 1
            total += 1

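        # Note: the full transcription is re-analysed for every sampled frame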
        emotion_results = analyze_emotion(transcription)
        emotions.append(emotion_results)

    video_capture.release()
    cv2.destroyAllWindows()

    if os.path.exists("temp_video.mp4"):
        os.remove("temp_video.mp4")

    gaze_percentage = on_camera / total * 100 if total > 0 else 0