import base64
import cv2
import face_recognition
import gradio as gr
import moviepy.editor as mp
import os
import time
import torchaudio
from fastai.vision.all import load_learner
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline

# Text-analysis pipelines for the transcript: emotion and sentiment classification.
emotion_pipeline = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion")
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# fastai gaze classifier (predicts 'on_camera' / 'off_camera' from a face crop).
model = load_learner("gaze-recognizer-v3.pkl")


def extract_audio(video_path):
    """Extract the video's audio track to audio.wav."""
    clip = mp.VideoFileClip(video_path)
    clip.audio.write_audiofile("audio.wav")


def analyze_emotion(text):
    return emotion_pipeline(text)


def analyze_sentiment(text):
    return sentiment_pipeline(text)


def get_transcription(path):
    """Transcribe the video's audio track with Whisper."""
    extract_audio(path)

    # Whisper expects 16 kHz mono audio.
    waveform, sample_rate = torchaudio.load("audio.wav")
    resampler = torchaudio.transforms.Resample(sample_rate, 16000)
    waveform = resampler(waveform)[0]

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
    whisper_model.config.forced_decoder_ids = None

    input_features = processor(
        waveform.squeeze(dim=0), sampling_rate=16000, return_tensors="pt"
    ).input_features
    predicted_ids = whisper_model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    return transcription[0]


def process_frame(frame):
    """Detect a face in the frame and classify its gaze; return the label or None."""
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    face_locations = face_recognition.face_locations(gray)

    if len(face_locations) > 0:
        for top, right, bottom, left in face_locations:
            face_image = gray[top:bottom, left:right]
            resized_face_image = cv2.resize(face_image, (128, 128))
            result = model.predict(resized_face_image)
            return result[0]

    return None


def video_processing(video_file, encoded_video):
    # The video may arrive as a base64 string instead of a file path.
    if encoded_video != "":
        decoded_file_data = base64.b64decode(encoded_video)
        with open("temp_video.mp4", "wb") as f:
            f.write(decoded_file_data)
        video_file = "temp_video.mp4"

    transcription = get_transcription(video_file)
    print(transcription)

    video_capture = cv2.VideoCapture(video_file)
    on_camera = 0
    off_camera = 0
    total = 0
    emotions = []

    while True:
        # Skip ahead ~3 seconds of footage (assuming 24 fps) and keep the last frame read.
        for _ in range(24 * 3):
            ret, frame = video_capture.read()
            if not ret:
                break
        if not ret:
            break

        result = process_frame(frame)
        if result:
            if result == 'on_camera':
                on_camera += 1
            elif result == 'off_camera':
                off_camera += 1
            total += 1

        # The transcription is fixed, so this repeats the same emotion analysis each pass.
        emotion_results = analyze_emotion(transcription)
        emotions.append(emotion_results)

    video_capture.release()
    cv2.destroyAllWindows()

    if os.path.exists("temp_video.mp4"):
        os.remove("temp_video.mp4")

    # Guard against division by zero when no face was detected in any sampled frame.
    gaze_percentage = on_camera / total * 100 if total > 0 else 0