# pseudo_video_gen/evaluation.py
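"""Evaluation utilities for pseudo_video_gen.

Scores a generated video against its storyboard with a GPT-4o rubric and
three automatic metrics: CLIP image-text similarity, optical-flow motion,
and frame-to-frame SSIM temporal coherence.
"""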
import json

import cv2
import numpy as np
import torch
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image
from skimage.metrics import structural_similarity as ssim
from transformers import CLIPProcessor, CLIPModel

# Load environment variables (expects OPENAI_API_KEY) for the OpenAI client.
load_dotenv()
client = OpenAI()

# CLIP model and processor for image-text similarity scoring.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def evaluate_with_gpt4(storyboard, video_description):
    """Ask GPT-4o to score the video against the storyboard; returns the parsed JSON dict."""
    system_prompt = (
        "You are a film critic evaluating how well a video matches a storyboard.\n"
        "Rate each of the following from 1 to 10:\n"
        "- Story Consistency: Does the video follow the scene and emotion described?\n"
        "- Shot Variety: Does it use interesting or varied camera angles?\n"
        "- Relevance: Does it suit the intended purpose (role, setting, emotion)?\n\n"
        "Provide scores and brief justifications for each.\n\n"
        "Respond only with a JSON object in this format:\n"
        "{\n"
        "  \"story_consistency\": <score>,\n"
        "  \"shot_variety\": <score>,\n"
        "  \"relevance\": <score>,\n"
        "  \"justification\": \"...\"\n"
        "}"
    )
    user_prompt = (
        f"Storyboard:\n"
        f"Scene: {storyboard['scene']}\n"
        f"Shot: {storyboard['shot_type']}\n"
        f"Emotion: {storyboard['emotion']}\n\n"
        f"Video Description:\n{video_description}"
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.3,
        # Force a JSON object so json.loads below does not choke on markdown fences.
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    content = response.choices[0].message.content.strip()
    return json.loads(content)
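# Usage sketch (hypothetical storyboard values; the real dicts come from the
# generation pipeline upstream):
#   scores = evaluate_with_gpt4(
#       {"scene": "A chef plating dessert", "shot_type": "close-up", "emotion": "pride"},
#       "A slow close-up of a chef dusting sugar over a plated cake.",
#   )
#   print(scores["story_consistency"], scores["justification"])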
def compute_clip_similarity(image_path, text_prompt):
    """Return the CLIP cosine similarity between an image and a text prompt."""
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(text=[text_prompt], images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    # Softmax over a single text prompt would always return 1.0, so score with
    # the cosine similarity of the normalized embeddings instead.
    image_embeds = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
    text_embeds = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)
    return (image_embeds @ text_embeds.T).item()
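# Usage sketch (hypothetical thumbnail path and prompt):
#   score = compute_clip_similarity("thumbnail.png", "a chef plating dessert")
# Matched image-text pairs score noticeably higher than unrelated ones, though
# raw CLIP cosine values stay well below 1.0 even for good matches.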
def compute_motion_score(video_path):
    """Average optical-flow magnitude across consecutive frames (higher = more motion)."""
    cap = cv2.VideoCapture(video_path)
    prev_gray = None
    motion_values = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_gray is not None:
            # Dense Farneback optical flow between consecutive grayscale frames.
            flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None,
                                                0.5, 3, 15, 3, 5, 1.2, 0)
            magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
            motion_values.append(np.mean(magnitude))
        prev_gray = gray
    cap.release()
    return float(np.mean(motion_values)) if motion_values else 0.0
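# Interpretation note: Farneback flow magnitude is measured in pixels per frame,
# so this score scales with resolution; only compare videos rendered at the same
# resolution.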
def compute_temporal_coherence(video_path):
    """Average SSIM between consecutive frames (closer to 1 = smoother transitions)."""
    cap = cv2.VideoCapture(video_path)
    prev_gray = None
    ssim_scores = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_gray is not None:
            ssim_scores.append(ssim(prev_gray, gray))
        prev_gray = gray
    cap.release()
    return float(np.mean(ssim_scores)) if ssim_scores else 0.0
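# Interpretation note: SSIM near 1.0 can also indicate a nearly static video,
# so read this together with motion_score above.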
def evaluate_video(storyboard, video_description, video_path, thumbnail_path, text_prompt):
    """Combine the GPT-4o rubric scores with the automatic CLIP/motion/coherence metrics."""
    gpt_eval = evaluate_with_gpt4(storyboard, video_description)
    clip_score = compute_clip_similarity(thumbnail_path, text_prompt)
    motion_score = compute_motion_score(video_path)
    coherence_score = compute_temporal_coherence(video_path)
    return {
        "gpt_eval": gpt_eval,
        "metrics": {
            "clip_similarity": clip_score,
            "motion_score": motion_score,
            "temporal_coherence": coherence_score
        }
    }
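if __name__ == "__main__":
    # Minimal smoke test with hypothetical file paths; replace them with real
    # outputs from the generation pipeline before running.
    demo_storyboard = {
        "scene": "A chef plating dessert",
        "shot_type": "close-up",
        "emotion": "pride",
    }
    result = evaluate_video(
        storyboard=demo_storyboard,
        video_description="A slow close-up of a chef dusting sugar over a plated cake.",
        video_path="demo_video.mp4",
        thumbnail_path="demo_thumbnail.png",
        text_prompt="a chef plating dessert",
    )
    print(json.dumps(result, indent=2))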