import json

import cv2
import numpy as np
import torch
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image
from skimage.metrics import structural_similarity as ssim
from transformers import CLIPProcessor, CLIPModel

# Load environment variables (expects OPENAI_API_KEY) and set up the clients/models.
load_dotenv()
client = OpenAI()
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def evaluate_with_gpt4(storyboard, video_description):
    """Ask GPT-4o to score how well a generated video matches its storyboard."""
    system_prompt = (
        "You are a film critic evaluating how well a video matches a storyboard.\n"
        "Rate each of the following from 1 to 10:\n"
        "- Story Consistency: Does the video follow the scene and emotion described?\n"
        "- Shot Variety: Does it use interesting or varied camera angles?\n"
        "- Relevance: Does it suit the intended purpose (role, setting, emotion)?\n\n"
        "Provide scores and brief justifications for each.\n\n"
        "Respond with a single JSON object in this format:\n"
        "{\n"
        "  \"story_consistency\": <score>,\n"
        "  \"shot_variety\": <score>,\n"
        "  \"relevance\": <score>,\n"
        "  \"justification\": \"...\"\n"
        "}"
    )
    user_prompt = (
        f"Storyboard:\n"
        f"Scene: {storyboard['scene']}\n"
        f"Shot: {storyboard['shot_type']}\n"
        f"Emotion: {storyboard['emotion']}\n\n"
        f"Video Description:\n{video_description}"
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.3,
        # JSON mode guarantees the reply parses; without it the model may wrap
        # the object in a markdown fence and json.loads() would fail.
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return json.loads(response.choices[0].message.content)
def compute_clip_similarity(image_path, text_prompt):
    """Cosine similarity between CLIP embeddings of a frame and a text prompt.

    Note: softmax over logits_per_image is meaningless with a single prompt
    (it always returns 1.0), so the embeddings are compared directly instead.
    """
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(text=[text_prompt], images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    # Normalize the projection embeddings so the dot product is cosine similarity
    # (typically well below 1 even for good matches).
    img = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
    txt = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)
    return (img @ txt.T).item()
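# Where softmax over logits_per_image *is* the right tool is ranking several
# candidate prompts against one image. A minimal sketch (rank_prompts is a
# hypothetical helper, not used by the pipeline in this file), assuming the
# same global clip_model and clip_processor:
def rank_prompts(image_path, prompts):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(text=prompts, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    probs = outputs.logits_per_image.softmax(dim=1).squeeze(0)  # one probability per prompt
    return sorted(zip(prompts, probs.tolist()), key=lambda pair: -pair[1])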
def compute_motion_score(video_path):
    """Mean dense optical-flow magnitude across the video (higher = more motion)."""
    cap = cv2.VideoCapture(video_path)
    prev_gray = None
    motion_values = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_gray is not None:
            # Farneback dense optical flow between consecutive frames.
            flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None,
                                                0.5, 3, 15, 3, 5, 1.2, 0)
            magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
            motion_values.append(np.mean(magnitude))
        prev_gray = gray
    cap.release()
    # Cast to a plain float so the result is JSON-serializable.
    return float(np.mean(motion_values)) if motion_values else 0.0
def compute_temporal_coherence(video_path):
    """Mean SSIM between consecutive frames (closer to 1 = smoother, less flicker)."""
    cap = cv2.VideoCapture(video_path)
    prev_gray = None
    ssim_scores = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_gray is not None:
            ssim_scores.append(ssim(prev_gray, gray))
        prev_gray = gray
    cap.release()
    return float(np.mean(ssim_scores)) if ssim_scores else 0.0
def evaluate_video(storyboard, video_description, video_path, thumbnail_path, text_prompt):
    """Combine the LLM judgement with the objective video metrics."""
    gpt_eval = evaluate_with_gpt4(storyboard, video_description)
    clip_score = compute_clip_similarity(thumbnail_path, text_prompt)
    motion_score = compute_motion_score(video_path)
    coherence_score = compute_temporal_coherence(video_path)
    return {
        "gpt_eval": gpt_eval,
        "metrics": {
            "clip_similarity": clip_score,
            "motion_score": motion_score,
            "temporal_coherence": coherence_score,
        },
    }
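# Hypothetical usage sketch: the storyboard fields and file paths below are
# placeholders, assuming a thumbnail frame has already been extracted from
# the generated video.
if __name__ == "__main__":
    storyboard = {
        "scene": "A lone astronaut walks across a dusty red plain at dawn",
        "shot_type": "wide tracking shot",
        "emotion": "isolation",
    }
    results = evaluate_video(
        storyboard,
        video_description="A figure in a spacesuit crosses a Martian landscape.",
        video_path="outputs/scene_01.mp4",      # placeholder path
        thumbnail_path="outputs/scene_01.jpg",  # placeholder path
        text_prompt="an astronaut walking on Mars at dawn",
    )
    print(json.dumps(results, indent=2))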