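"""
Video-MME multiple-choice QA evaluation.

Computes answer accuracy from a results JSON file, with optional breakdowns by
video domain, sub-category, and task type, reported per duration split and over
the entire dataset.
"""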
import os
import re
import json
import argparse
from typing import List, Dict, Optional, Union
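# Video-MME taxonomy used for the per-category accuracy breakdowns.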
CATEGORIES = [
"Knowledge",
"Film & Television",
"Sports Competition",
"Artistic Performance",
"Life Record",
"Multilingual"
]
SUB_CATEGORIES = [
"Humanity & History",
"Literature & Art",
"Biology & Medicine",
"Finance & Commerce",
"Astronomy",
"Geography",
"Law",
"Life Tip",
"Technology",
"Animation",
"Movie & TV Show",
"Documentary",
"News Report",
"Esports",
"Basketball",
"Football",
"Athletics",
"Other Sports",
"Stage Play",
"Magic Show",
"Variety Show",
"Acrobatics",
"Handicraft",
"Food",
"Fashion",
"Daily Life",
"Travel",
"Pet & Animal",
"Exercise",
"Multilingual"
]
TASK_CATEGORIES = [
"Temporal Perception",
"Spatial Perception",
"Attribute Perception",
"Action Recognition",
"Object Recognition",
"OCR Problems",
"Counting Problem",
"Temporal Reasoning",
"Spatial Reasoning",
"Action Reasoning",
"Object Reasoning",
"Information Synopsis",
]
def extract_characters_regex(s):
s = s.strip()
    answer_prefixes = [
        "The best answer is",
        "The correct answer is",
        "The answer is",
        "The answer",
        "The best option is",
        "The correct option is",
        "Best answer:",
        "Best option:",
    ]
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, "")
if len(s.split()) > 10 and not re.search("[ABCD]", s):
return ""
matches = re.search(r'[ABCD]', s)
if matches is None:
return ""
return matches[0]
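# Illustrative behaviour of extract_characters_regex (inputs are made-up examples):
#   extract_characters_regex("The best answer is (C) a red car.")  -> "C"
#   extract_characters_regex("I cannot tell from this clip.")      -> ""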
def eval_your_results(
your_results_path: str,
video_types: Optional[Union[List[str], str]] = None,
skip_missing: Optional[bool] = True,
return_categories_accuracy: Optional[bool] = True,
return_sub_categories_accuracy: Optional[bool] = False,
return_task_types_accuracy: Optional[bool] = False,
gt_answer_key: Optional[str] = "answer",
your_answer_key: Optional[str] = "response"
):
"""
Evaluate your results against the ground truth
Args:
- your_results_path (str): Path to your results file
    - video_types (Optional[Union[List[str], str]]): Video types (duration splits) to evaluate, given as a list or a comma-separated string.
- skip_missing (Optional[bool]): If True, missing files will be skipped. If False, an error will be raised if there are missing files.
- return_categories_accuracy (Optional[bool]): If True, the accuracy for each video category will be returned.
- return_sub_categories_accuracy (Optional[bool]): If True, the accuracy for each video sub category will be returned.
- return_task_types_accuracy (Optional[bool]): If True, the accuracy for each task category will be returned.
- gt_answer_key (Optional[str]): Key to access the ground truth answer in the results file.
- your_answer_key (Optional[str]): Key to access your answer in the results file.
"""
# Load your results
with open(your_results_path, 'r') as f:
your_results = json.load(f)
if isinstance(video_types, str):
video_types = video_types.split(",")
q_type_dict = {}
v_type_dict = {}
v_sub_type_dict = {}
for video_type in video_types:
# Filter your results based on video types
your_results_video_type = [item for item in your_results if item["duration"] == video_type]
# Task Categories
q_type_dict[video_type] = {}
for q_type in TASK_CATEGORIES:
q_type_dict[video_type][q_type] = {"correct": 0, "answered": 0}
# Video categories
v_type_dict[video_type] = {}
for v_type in CATEGORIES:
v_type_dict[video_type][v_type] = {"correct": 0, "answered": 0}
v_sub_type_dict[video_type] = {}
for v_sub_type in SUB_CATEGORIES:
v_sub_type_dict[video_type][v_sub_type] = {"correct": 0, "answered": 0}
if not skip_missing:
# Check if the number of files in your results and ground truth are the same
assert len(your_results_video_type) == 300, f"Number of files in {video_type} is not 300. Check if there are missing files."
for item in your_results_video_type:
if skip_missing and item["missing"]:
continue
# Get the video category, sub category and question category
video_category = item["domain"]
video_sub_category = item["sub_category"]
questions = item["questions"]
for question in questions:
q_type = question["task_type"]
# Get the ground truth and your response
gt_answer = question[gt_answer_key]
response = question[your_answer_key]
# Extract the answer from the response
                extraction = extract_characters_regex(response)
                if extraction != "":
                    q_type_dict[video_type][q_type]["answered"] += 1
                    q_type_dict[video_type][q_type]["correct"] += extraction == gt_answer
                    v_type_dict[video_type][video_category]["answered"] += 1
                    v_type_dict[video_type][video_category]["correct"] += extraction == gt_answer
                    v_sub_type_dict[video_type][video_sub_category]["answered"] += 1
                    v_sub_type_dict[video_type][video_sub_category]["correct"] += extraction == gt_answer
# Print the results for each video type
for video_type in video_types:
print("=====================================")
print(f"Evaluation on video Type: {video_type}")
print("=====================================")
if return_categories_accuracy:
print("-------------------------------------")
print("Video Domains")
print("-------------------------------------")
for v_type in v_type_dict[video_type]:
print(f"{v_type}: {100 * v_type_dict[video_type][v_type]['correct'] / v_type_dict[video_type][v_type]['answered'] if v_type_dict[video_type][v_type]['answered'] > 0 else 0 : .1f}%")
if return_sub_categories_accuracy:
print("-------------------------------------")
print("Video Sub Categories")
print("-------------------------------------")
for v_sub_type in v_sub_type_dict[video_type]:
print(f"{v_sub_type}: {100 * v_sub_type_dict[video_type][v_sub_type]['correct'] / v_sub_type_dict[video_type][v_sub_type]['answered'] if v_sub_type_dict[video_type][v_sub_type]['answered'] > 0 else 0 : .1f}%")
if return_task_types_accuracy:
print("-------------------------------------")
print("Task Categories")
print("-------------------------------------")
for q_type in q_type_dict[video_type]:
print(f"{q_type}: {100 * q_type_dict[video_type][q_type]['correct'] / q_type_dict[video_type][q_type]['answered'] if q_type_dict[video_type][q_type]['answered'] > 0 else 0 : .1f}%")
print("-------------------------------------")
print("Overall Performance")
print("-------------------------------------")
total_correct = sum([q_type_dict[video_type][q_type]["correct"] for q_type in TASK_CATEGORIES])
total_answered = sum([q_type_dict[video_type][q_type]["answered"] for q_type in TASK_CATEGORIES])
print(f"Overall: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
print("\n")
# Print the results for the entire dataset
print("=====================================")
print("Evaluation on the entire dataset")
print("=====================================")
if return_categories_accuracy:
print("-------------------------------------")
print("Video Categories")
print("-------------------------------------")
for v_type in CATEGORIES:
total_correct = sum([v_type_dict[video_type][v_type]["correct"] for video_type in video_types])
total_answered = sum([v_type_dict[video_type][v_type]["answered"] for video_type in video_types])
print(f"{v_type}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
if return_sub_categories_accuracy:
print("-------------------------------------")
print("Video Sub Categories")
print("-------------------------------------")
for v_sub_type in SUB_CATEGORIES:
total_correct = sum([v_sub_type_dict[video_type][v_sub_type]["correct"] for video_type in video_types])
total_answered = sum([v_sub_type_dict[video_type][v_sub_type]["answered"] for video_type in video_types])
print(f"{v_sub_type}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
if return_task_types_accuracy:
print("-------------------------------------")
print("Task Categories")
print("-------------------------------------")
for q_type in TASK_CATEGORIES:
total_correct = sum([q_type_dict[video_type][q_type]["correct"] for video_type in video_types])
total_answered = sum([q_type_dict[video_type][q_type]["answered"] for video_type in video_types])
print(f"{q_type}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
print("-------------------------------------")
print("Overall Performance")
print("-------------------------------------")
total_correct = sum([sum([q_type_dict[video_type][q_type]["correct"] for q_type in TASK_CATEGORIES]) for video_type in video_types])
total_answered = sum([sum([q_type_dict[video_type][q_type]["answered"] for q_type in TASK_CATEGORIES]) for video_type in video_types])
print(f"Overall: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--results_file", type=str, required=True)
parser.add_argument("--video_duration_type", type=str, required=True)
parser.add_argument("--return_categories_accuracy", action="store_true")
parser.add_argument("--return_sub_categories_accuracy", action="store_true")
parser.add_argument("--return_task_types_accuracy", action="store_true")
parser.add_argument("--skip_missing", action="store_true")
args = parser.parse_args()
eval_your_results(
args.results_file,
video_types=args.video_duration_type,
skip_missing=args.skip_missing,
return_categories_accuracy=args.return_categories_accuracy,
return_sub_categories_accuracy=args.return_sub_categories_accuracy,
return_task_types_accuracy=args.return_task_types_accuracy,
)
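# Example invocation (hypothetical paths; Video-MME duration splits are
# typically "short", "medium", and "long"):
#   python eval_video_mcqa_videomme.py \
#       --results_file work_dirs/videomme_predictions.json \
#       --video_duration_type short,medium,long \
#       --return_categories_accuracy \
#       --skip_missing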