|
import copy |
|
import datetime |
|
|
import os |
|
from email.utils import parseaddr |
|
import re |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
from datasets import Dataset, DatasetDict, VerificationMode, get_dataset_config_names, load_dataset |
|
from huggingface_hub import HfApi |
|
|
|
from content import ( |
|
CITATION_BUTTON_LABEL, |
|
CITATION_BUTTON_TEXT, |
|
INTRODUCTION_TEXT, |
|
SUBMISSION_TEXT, |
|
TITLE, |
|
format_error, |
|
format_log, |
|
format_warning, |
|
model_hyperlink, |
|
) |
|
|
|
TOKEN = os.environ.get("HF_TOKEN", None) |
|
|
|
|
|
OWNER = "facebook" |
|
|
|
SUBMISSION_DATASET = f"{OWNER}/pwm_leaderboard_submissions_internal" |
|
CONTACT_DATASET = f"{OWNER}/pwm_leaderboard_contact_info_internal" |
|
|
|
RESULTS_DATASET = f"{OWNER}/pwm_leaderboard_results_public" |
|
LEADERBOARD_PATH = f"{OWNER}/pwm_leaderboard" |
|
DATA_VERSION = "1.0.0" |
|
|
|
|
|
MVP_DATASET = "facebook/minimal_video_pairs" |
|
INTP_DATASET = "facebook/IntPhys2_test" |
|
WMQA_DATASET = "facebook/CausalVQA" |
|
|
|
|
|
MVP_NAME = "MVPBench" |
|
INTP_NAME = "IntPhys 2" |
|
WMQA_NAME = "CausalVQA" |
|
|
|
|
|
MVP_KEY = "mvp" |
|
MVP_MINI_KEY = "mvp_mini" |
|
INTP_KEY = "intphys2" |
|
WMQA_KEY = "causalvqa" |
|
|
|
TASKS = [ |
|
(INTP_KEY, INTP_NAME), |
|
(MVP_KEY, MVP_NAME), |
|
(WMQA_KEY, WMQA_NAME), |
|
] |
|
VISIBLE_TASKS = copy.deepcopy(TASKS) |
|
PRE_COL_NAMES = ["Model Name"] |
|
POST_COL_NAMES = ["Model Type", "Vision Backbone", "LLM Backbone", "Submission Date"] |
|
|
|
|
|
api = HfApi() |
|
|
|
os.makedirs("scored", exist_ok=True) |
|
|
|
LOCAL_DEBUG = False |
|
|
|
|
|
|
|
LDB_TEXT_KEYS = ["model", "model_type", "vision_backbone", "llm_backbone"] |
|
LDB_TEXT_TYPES = ["markdown", "text", "text", "text"] |
|
MISSING_VALUE = -1.0 |
|
|
|
HUMAN_BASELINES = { |
|
"url": "", |
|
"model": "Human", |
|
"model_type": "Human", |
|
"system_prompt": "test", |
|
"vision_backbone": " - ", |
|
"llm_backbone": " - ", |
|
"num_frames": -1, |
|
f"score_{INTP_KEY}": 92.44, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": 92.9, |
|
f"score_{WMQA_KEY}": 84.78, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
|
|
GEMINI2_5 = { |
|
"url": "https://deepmind.google/models/gemini/flash/", |
|
"model": "Gemini 2.5 Flash", |
|
"model_type": "Closed", |
|
"system_prompt": "test", |
|
"vision_backbone": " - ", |
|
"llm_backbone": " - ", |
|
"num_frames": 10, |
|
f"score_{INTP_KEY}": 56.1, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": MISSING_VALUE, |
|
f"score_{WMQA_KEY}": 61.66, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
GPT4O = { |
|
"url": "https://openai.com/index/gpt-4o-system-card/", |
|
"model": "GPT-4o", |
|
"model_type": "Closed", |
|
"system_prompt": "test", |
|
"vision_backbone": " - ", |
|
"llm_backbone": " - ", |
|
"num_frames": 10, |
|
f"score_{INTP_KEY}": 53.19, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": 32.5, |
|
f"score_{WMQA_KEY}": 50.95, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
INTERN_VL = { |
|
"url": "https://internvl.github.io/blog/2024-12-05-InternVL-2.5/", |
|
"model": "InternVL2.5", |
|
"model_type": "Open", |
|
"system_prompt": "test", |
|
"vision_backbone": "InternViT-300M", |
|
"llm_backbone": "InternLM2.5-7B-Chat", |
|
"num_frames": 16, |
|
f"score_{INTP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": 39.9, |
|
f"score_{WMQA_KEY}": 47.54, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
LLAVA = { |
|
"url": "https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov", |
|
"model": "LLaVA-OneVision", |
|
"model_type": "Open", |
|
"system_prompt": "test", |
|
"vision_backbone": "SigLIP", |
|
"llm_backbone": "Qwen2-7B", |
|
"num_frames": 16, |
|
f"score_{INTP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": 20.7, |
|
f"score_{WMQA_KEY}": 45.27, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
PLM = { |
|
"url": "https://github.com/facebookresearch/perception_models", |
|
"model": "Perception Language Model (PLM)", |
|
"model_type": "Open", |
|
"system_prompt": "test", |
|
"vision_backbone": "PE", |
|
"llm_backbone": "Llama3.1 8B", |
|
"num_frames": 16, |
|
f"score_{INTP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": 39.7, |
|
f"score_{WMQA_KEY}": 50.06, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
QWENVL = { |
|
"url": "https://github.com/QwenLM/Qwen2.5-VL", |
|
"model": "Qwen2.5-VL", |
|
"model_type": "Open", |
|
"system_prompt": "test", |
|
"vision_backbone": "ViT", |
|
"llm_backbone": "Qwen2.5-7B-Instruct", |
|
"num_frames": 16, |
|
f"score_{INTP_KEY}": 49.12, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": 36.7, |
|
f"score_{WMQA_KEY}": 49.05, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
GEMINI1_5 = { |
|
"url": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/1-5-pro", |
|
"model": "Gemini 1.5 Pro", |
|
"model_type": "Closed", |
|
"system_prompt": "test", |
|
"vision_backbone": " - ", |
|
"llm_backbone": " - ", |
|
"num_frames": -1, |
|
f"score_{INTP_KEY}": 52.1, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": 29.6, |
|
f"score_{WMQA_KEY}": MISSING_VALUE, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
VJEPA2 = { |
|
"url": "https://ai.meta.com/vjepa/", |
|
"model": "V-JEPA 2", |
|
"model_type": "Open", |
|
"system_prompt": "test", |
|
"vision_backbone": "VJEPA 2", |
|
"llm_backbone": "Llama3.1 8B", |
|
"num_frames": -1, |
|
f"score_{INTP_KEY}": 56.4, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": 44.5, |
|
f"score_{WMQA_KEY}": 38.99, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
COSMOS = { |
|
"url": "https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-4B", |
|
"model": "Cosmos-4B", |
|
"model_type": "Open", |
|
"system_prompt": "test", |
|
"vision_backbone": " - ", |
|
"llm_backbone": " - ", |
|
"num_frames": -1, |
|
f"score_{INTP_KEY}": 48.84, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": MISSING_VALUE, |
|
f"score_{WMQA_KEY}": MISSING_VALUE, |
|
"date": "2025-06-11", |
|
"organization": "Meta", |
|
"submitted_by": "user", |
|
} |
|
|
|
|
|
def get_dataframe_from_results(eval_results, split): |
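    """
    Flatten one split of the results dataset into a display-ready DataFrame:
    merge repeat submissions per (model, organization), append the hard-coded
    baseline rows, average the available task scores, keep entries dated on or
    after 2025-06-11, and rename columns for the leaderboard.
    """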
|
local_df = eval_results[split] |
|
local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])}) |
|
local_df = local_df.remove_columns(["system_prompt"]) |
|
|
|
df = pd.DataFrame(local_df) |
|
|
|
|
|
df["model_org"] = df["model"].str.cat(df["organization"], sep="-") |
|
ldb_m2r = {} |
|
for i, row in df.iterrows(): |
|
if row["model_org"] not in ldb_m2r: |
|
ldb_m2r[row["model_org"]] = {} |
|
|
|
prev_d = ldb_m2r[row["model_org"]] |
|
new_d = {} |
|
for key in LDB_TEXT_KEYS: |
|
new_d[key] = row[key] if len(row[key]) > 0 else prev_d.get(key, "NA") |
|
for tname, _ in TASKS: |
|
new_d[f"score_{tname}"] = ( |
|
row[f"score_{tname}"] if row[f"score_{tname}"] >= 0 else prev_d.get(f"score_{tname}", MISSING_VALUE) |
|
) |
|
            if tname == MVP_KEY:
                new_d[f"score_{MVP_MINI_KEY}"] = (
                    row[f"score_{MVP_MINI_KEY}"]
                    if row[f"score_{MVP_MINI_KEY}"] >= 0
                    else prev_d.get(f"score_{MVP_MINI_KEY}", MISSING_VALUE)
                )
|
new_d["date"] = row["date"] |
|
ldb_m2r[row["model_org"]] = new_d |
|
|
|
|
|
ldb_m2r["human"] = HUMAN_BASELINES |
|
ldb_m2r["gemini2.5"] = GEMINI2_5 |
|
ldb_m2r["gemini1.5"] = GEMINI1_5 |
|
ldb_m2r["gpt4o"] = GPT4O |
|
ldb_m2r["internvl"] = INTERN_VL |
|
ldb_m2r["llavaov"] = LLAVA |
|
ldb_m2r["plm"] = PLM |
|
ldb_m2r["qwen2.5"] = QWENVL |
|
ldb_m2r["vjepa2"] = VJEPA2 |
|
ldb_m2r["cosmos"] = COSMOS |
|
|
|
ldb_rows = [] |
|
for key, val in ldb_m2r.items(): |
|
        # Baseline dicts carry a raw URL; turn the model name into a hyperlink.
        if "url" in val and val["url"] != "":
            val["model"] = model_hyperlink(val["url"], val["model"])
|
row = copy.deepcopy(val) |
|
score_keys = {k for k in val if k.startswith("score_")} |
|
        row["score"] = np.round(
            np.mean([row[sk] for sk in score_keys if row[sk] != MISSING_VALUE and row[sk] != "-"]), 2
        )
|
tasks_completed = 0 |
|
for sk in score_keys: |
|
if row[sk] == MISSING_VALUE: |
|
row[sk] = "-" |
|
else: |
|
tasks_completed += 1 |
|
row["tasks_completed"] = tasks_completed |
|
ldb_rows.append(row) |
|
|
|
df = pd.DataFrame(ldb_rows) |
|
df = df.query('date >= "2025-06-11"') |
|
|
|
|
|
|
|
df = df.sort_values(by=["tasks_completed", "score"], ascending=False) |
|
|
|
|
|
numeric_cols = [c for c in df.columns if c.startswith("score_")] |
|
for nc in numeric_cols: |
|
        df[nc] = df[nc].apply(lambda x: np.round(x, 2) if isinstance(x, float) else x)
|
|
|
|
|
df.drop(["tasks_completed"], axis=1, inplace=True) |
|
col_mapper = {f"score_{tname}": f"{tdisplay} (%)" for tname, tdisplay in TASKS if tname != "mvp"} |
|
col_mapper.update( |
|
{ |
|
"model": "Model Name", |
|
"model_type": "Model Type", |
|
"vision_backbone": "Vision Backbone", |
|
"llm_backbone": "LLM Backbone", |
|
|
|
"date": "Submission Date", |
|
} |
|
) |
|
df.rename(col_mapper, axis=1, inplace=True) |
|
|
|
df[f"{MVP_NAME} (%)"] = df.score_mvp_mini.astype(str) |
|
df.drop([f"score_{MVP_KEY}", f"score_{MVP_MINI_KEY}"], axis=1, inplace=True) |
|
|
|
df = df[PRE_COL_NAMES + [f"{t[1]} (%)" for t in VISIBLE_TASKS] + POST_COL_NAMES] |
|
|
|
return df |
|
|
|
|
|
def create_dummy_data(): |
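    """Build small in-memory results/contact datasets for local testing."""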
|
|
|
rows = [ |
|
{ |
|
"url": "https://deepmind.google/models/gemini/flash/", |
|
"model": "Gemini Test", |
|
"model_type": "Closed", |
|
"system_prompt": "test", |
|
"vision_backbone": " - ", |
|
"llm_backbone": " - ", |
|
"num_frames": 10, |
|
f"score_{INTP_KEY}": 56.1, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": MISSING_VALUE, |
|
f"score_{WMQA_KEY}": 61.66, |
|
"date": datetime.datetime.today().strftime("%Y-%m-%d"), |
|
"organization": "test", |
|
"submitted_by": "octocat", |
|
}, |
|
{ |
|
"url": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf", |
|
"model": "Llava 1.6", |
|
"model_type": "Open", |
|
"system_prompt": "test", |
|
"vision_backbone": "CLIP", |
|
"llm_backbone": "Mistral", |
|
"num_frames": 16, |
|
f"score_{INTP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": MISSING_VALUE, |
|
f"score_{WMQA_KEY}": MISSING_VALUE, |
|
"date": datetime.datetime.today().strftime("%Y-%m-%d"), |
|
"organization": "test", |
|
"submitted_by": "octocat", |
|
}, |
|
{ |
|
"url": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf", |
|
"model": "Llava 1.6", |
|
"model_type": "Open", |
|
"system_prompt": "test", |
|
"vision_backbone": "CLIP", |
|
"llm_backbone": "Mistral", |
|
"num_frames": 16, |
|
f"score_{INTP_KEY}": 0.0, |
|
f"score_{MVP_KEY}": MISSING_VALUE, |
|
f"score_{MVP_MINI_KEY}": MISSING_VALUE, |
|
f"score_{WMQA_KEY}": 0.0, |
|
"date": datetime.datetime.today().strftime("%Y-%m-%d"), |
|
"organization": "test", |
|
"submitted_by": "octocat", |
|
}, |
|
] |
|
dt = DatasetDict({"valid": Dataset.from_list(rows), "test": Dataset.from_list(rows)}) |
|
|
|
contact_info = { |
|
"model": "llama", |
|
"url": "test", |
|
"organization": "test", |
|
"username": "test", |
|
"mail": "test", |
|
"date": datetime.datetime.today().strftime("%Y-%m-%d"), |
|
} |
|
cdt = DatasetDict({"valid": Dataset.from_list([contact_info]), "test": Dataset.from_list([contact_info])}) |
|
return dt, cdt |
|
|
|
|
|
DUMMY_DATA = False |
|
|
|
|
|
def get_eval_data(): |
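    """Load the results dataset (or dummy data when DUMMY_DATA is set) and build both leaderboard views."""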
|
if DUMMY_DATA: |
|
eval_results, _ = create_dummy_data() |
|
else: |
|
eval_results = load_dataset( |
|
RESULTS_DATASET, |
|
token=TOKEN, |
|
download_mode="force_redownload", |
|
verification_mode=VerificationMode.NO_CHECKS, |
|
trust_remote_code=True, |
|
) |
|
eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="valid") |
|
eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test") |
|
return eval_results, eval_dataframe_val, eval_dataframe_test |
|
|
|
|
|
def restart_space(): |
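    """Restart the Space (scheduled hourly below) so the leaderboard reloads fresh results."""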
|
api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN) |
|
|
|
|
|
|
|
|
|
|
|
def validate_mvp(submission_df, split="valid"): |
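    """
    Check an MVPBench submission: only known subsets (mvp / mvp_mini), every
    task config present, one row per gold example, and row_ids that match the
    gold video_ids.
    """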
|
subsets = submission_df.data_name.unique() |
|
for subset in subsets: |
|
        assert subset in [MVP_KEY, MVP_MINI_KEY], format_error(
            f"Wrong tasks, got {subset} but expecting either {MVP_KEY} or {MVP_MINI_KEY}"
        )
|
gold_tasks = get_dataset_config_names(MVP_DATASET, token=TOKEN) |
|
for subset in subsets: |
|
tasks = submission_df[submission_df.data_name == subset].task.unique() |
|
assert len(tasks) == len(gold_tasks), format_error( |
|
f"{MVP_NAME} submission must have all tasks, found = {tasks}, expecting = {gold_tasks}" |
|
) |
|
for task in tasks: |
|
sub_df = submission_df[(submission_df.data_name == subset) & (submission_df.task == task)].copy() |
|
assert task in gold_tasks, format_error(f"Found unknown task {task} for {MVP_NAME}, check submission") |
|
gold_dataset = load_dataset(MVP_DATASET, task, split="full" if subset == MVP_KEY else "mini", token=TOKEN) |
|
            assert len(sub_df) == len(gold_dataset), format_error(
                f"Number of examples does not match in user submission, found {len(sub_df)} but expecting {len(gold_dataset)} for task {task} in split {subset}"
            )
|
id2answer = {row["video_id"]: row["answer"] for row in gold_dataset} |
|
for i, r in sub_df.iterrows(): |
|
assert r["row_id"] in id2answer, format_error( |
|
f"Submission contains row_id {r['row_id']} which doesn't match the dataset's video_id" |
|
) |
|
|
|
|
|
def compute_scores_mvp(submission_df, split="valid"): |
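    """Rate each MVPBench row 1/0 against the gold answers, per subset and task."""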
|
gold_tasks = get_dataset_config_names(MVP_DATASET, token=TOKEN) |
|
subsets = submission_df.data_name.unique() |
|
scored_subs = [] |
|
for subset in subsets: |
|
tasks = submission_df[submission_df.data_name == subset].task.unique() |
|
assert len(tasks) == len(gold_tasks), format_error(f"{MVP_NAME} submission must have all tasks") |
|
for task in tasks: |
|
sub_df = submission_df[(submission_df.data_name == subset) & (submission_df.task == task)].copy() |
|
gold_dataset = load_dataset(MVP_DATASET, task, split="full" if subset == MVP_KEY else "mini", token=TOKEN) |
|
id2answer = {row["video_id"]: row["answer"] for row in gold_dataset} |
|
correct = [] |
|
for i, r in sub_df.iterrows(): |
|
gold_answer = id2answer[r["row_id"]] |
|
model_answer = r["model_answer"] |
|
if gold_answer == model_answer: |
|
correct.append(1) |
|
else: |
|
correct.append(0) |
|
sub_df["rating"] = correct |
|
scored_subs.append(sub_df) |
|
return pd.concat(scored_subs) |
|
|
|
|
|
def aggregate_scores_mvp(scored_submission_df, split="valid"): |
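    """
    Aggregate MVPBench with paired accuracy: the two rows that share a video id
    must both be rated correct for the pair to count. Each task's pair accuracy
    is averaged into the subset score.
    """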
|
subsets = scored_submission_df.data_name.unique() |
|
subset_scores = {f"score_{s}": 0 for s in subsets} |
|
for subset in subsets: |
|
tasks = scored_submission_df[scored_submission_df.data_name == subset].task.unique() |
|
task_pair_accuracies = [] |
|
for task in tasks: |
|
sub_df = scored_submission_df[ |
|
(scored_submission_df.data_name == subset) & (scored_submission_df.task == task) |
|
].copy() |
|
result_by_vid = {} |
|
pair_correct_count = 0 |
|
for i, row in sub_df.iterrows(): |
|
video_id = "_".join(row["row_id"].split("_")[:-1]) |
|
if video_id not in result_by_vid: |
|
result_by_vid[video_id] = [row.to_dict()] |
|
else: |
|
result_by_vid[video_id].append(row.to_dict()) |
|
for video_id, answer_dict_pair in result_by_vid.items(): |
|
answer_dict_1, answer_dict_2 = answer_dict_pair |
|
if answer_dict_1["rating"] == 1 and answer_dict_2["rating"] == 1: |
|
pair_correct_count += 1 |
|
|
|
task_pair_accuracies.append((pair_correct_count / len(result_by_vid)) * 100) |
|
|
|
subset_scores[f"score_{subset}"] = np.mean(task_pair_accuracies) |
|
return subset_scores |
|
|
|
|
|
|
|
|
|
def validate_causalvqa(submission_df, split="test"): |
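    """Check a CausalVQA submission: known subset, one row per gold (id, n) pair, matching row_ids."""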
|
|
|
    # Gold answers for CausalVQA are stored in the dataset's "train" split,
    # regardless of the leaderboard split requested.
    split = "train"
|
subsets = submission_df.data_name.unique() |
|
for subset in subsets: |
|
        assert subset in [WMQA_KEY], format_error(
            f"Wrong tasks, got {subset} but expecting {WMQA_KEY}"
        )
|
gold_tasks = get_dataset_config_names(WMQA_DATASET, token=TOKEN) |
|
for subset in subsets: |
|
        task = "default"
|
sub_df = submission_df[(submission_df.data_name == subset)].copy() |
|
gold_dataset = load_dataset(WMQA_DATASET, "", split="train", token=TOKEN) |
|
        assert len(sub_df) == len(gold_dataset), format_error(
            f"Number of examples does not match in user submission, found {len(sub_df)} but expecting {len(gold_dataset)} for task {task} in split {subset}"
        )
|
id2answer = {row["id"]+'_'+str(row["n"]): row["answer"] for row in gold_dataset} |
|
for i, r in sub_df.iterrows(): |
|
assert r["row_id"] in id2answer, format_error( |
|
f"Submission contains row_id {r['row_id']} which doesn't match the dataset's qid" |
|
) |
|
print('validated') |
|
|
|
def compute_scores_causalvqa(submission_df, split="test"): |
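    """
    Score a CausalVQA submission: normalize model answers down to the answer
    letters (a-e, uppercased), join them onto the gold rows by row_id (keeping
    every gold row, so unanswered questions rate 0), and add a rating column.
    """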
|
|
|
    # Gold answers live in the "train" split (see validate_causalvqa).
    split = "train"
|
gold_tasks = get_dataset_config_names(WMQA_DATASET, token=TOKEN) |
|
subsets = submission_df.data_name.unique() |
|
scored_subs = [] |
|
for subset in subsets: |
|
sub_df = submission_df[(submission_df.data_name == subset)].copy() |
|
sub_df['model_answer'] = sub_df['model_answer'].str.replace(r'[^a-eA-E]', '', regex=True, flags=re.IGNORECASE).str.upper() |
|
gold_dataset = load_dataset(WMQA_DATASET, "", split="train", token=TOKEN) |
|
gold_dataset = gold_dataset.to_pandas() |
|
gold_dataset['row_id'] = gold_dataset.apply(lambda x: x['id']+'_'+str(x['n']), axis=1) |
|
joined = pd.merge(gold_dataset, sub_df, on='row_id', how='left') |
|
correct = [] |
|
for i, r in joined.iterrows(): |
|
gold_answer = r['answer'] |
|
model_answer = r["model_answer"] |
|
if gold_answer == model_answer: |
|
correct.append(1) |
|
else: |
|
correct.append(0) |
|
joined["rating"] = correct |
|
scored_subs.append(joined) |
|
|
print('scored') |
|
return pd.concat(scored_subs) |
|
|
|
def aggregate_scores_causalvqa(scored_submission_df, split="test"): |
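    """
    Aggregate CausalVQA: rows are grouped by (id, strata), and a group earns a
    point only when its ratings sum to 2, i.e. both paired questions are
    answered correctly. The subset score is the mean of those points, in percent.
    """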
|
subsets = scored_submission_df.data_name.unique() |
|
subset_scores = {f"score_{s}": 0 for s in subsets} |
|
for subset in subsets: |
|
sub_df = scored_submission_df[scored_submission_df.data_name == subset].copy() |
|
agg_df = sub_df.groupby(['id','strata'])['rating'].sum().reset_index() |
|
agg_df['points'] = 0 |
|
agg_df.loc[agg_df['rating']==2, 'points'] = 1 |
|
|
|
|
|
|
|
subset_scores[f"score_{subset}"] = agg_df.points.mean()*100.00 |
|
print('aggregated') |
|
return subset_scores |
|
|
|
|
|
|
|
|
|
|
|
def validate_intphys(submission_df, split="test"): |
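    """Check an IntPhys 2 submission: test split only, known subset, one row per gold video name."""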
|
assert split == "test", format_error(f"Split {split} not available for dataset {INTP_NAME}") |
|
subsets = submission_df.data_name.unique() |
|
for subset in subsets: |
|
        assert subset in [INTP_KEY], format_error(
            f"Wrong tasks, got {subset} but expecting {INTP_KEY}"
        )
|
gold_tasks = get_dataset_config_names(INTP_DATASET, token=TOKEN) |
|
for subset in subsets: |
|
sub_df = submission_df[(submission_df.data_name == subset)].copy() |
|
        gold_dataset = load_dataset(INTP_DATASET, "", split="test", token=TOKEN)
|
        assert len(sub_df) == len(gold_dataset), format_error(
            f"Number of examples does not match in user submission, found {len(sub_df)} but expecting {len(gold_dataset)} in split {subset}"
        )
|
id2answer = {row["name"]: row["answer"] for row in gold_dataset} |
|
for i, r in sub_df.iterrows(): |
|
assert r["row_id"] in id2answer, format_error( |
|
f"Submission contains row_id {r['row_id']} which doesn't match the dataset's video_id" |
|
) |
|
|
|
|
|
|
|
def compute_scores_intphys(submission_df, split="test"): |
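    """Rate each IntPhys 2 row 1/0 against the gold test answers."""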
|
assert split == "test", format_error(f"Split {split} not available for dataset {INTP_NAME}") |
|
gold_tasks = get_dataset_config_names(INTP_DATASET, token=TOKEN) |
|
subsets = submission_df.data_name.unique() |
|
scored_subs = [] |
|
for subset in subsets: |
|
sub_df = submission_df[(submission_df.data_name == subset)].copy() |
|
gold_dataset = load_dataset(INTP_DATASET, "", split="test", token=TOKEN) |
|
id2answer = {row["name"]: row["answer"] for row in gold_dataset} |
|
correct = [] |
|
for i, r in sub_df.iterrows(): |
|
gold_answer = id2answer[r["row_id"]] |
|
model_answer = r["model_answer"] |
|
if gold_answer == model_answer: |
|
correct.append(1) |
|
else: |
|
correct.append(0) |
|
sub_df["rating"] = correct |
|
scored_subs.append(sub_df) |
|
return pd.concat(scored_subs) |
|
|
|
|
|
def aggregate_scores_intphys(scored_submission_df, split="test"): |
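    """Aggregate IntPhys 2 as plain per-row accuracy, in percent, per subset."""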
|
subsets = scored_submission_df.data_name.unique() |
|
subset_scores = {f"score_{s}": 0 for s in subsets} |
|
    for subset in subsets:
        sub_df = scored_submission_df[scored_submission_df.data_name == subset].copy()
        # Compute accuracy per subset; a shared accumulator across subsets
        # would let one subset's rows leak into another's score.
        correct_count = int((sub_df["rating"] == 1).sum())
        subset_scores[f"score_{subset}"] = (correct_count / len(sub_df)) * 100
    return subset_scores
|
|
|
|
|
|
|
|
|
VALIDATION_FN = { |
|
MVP_KEY: validate_mvp, |
|
MVP_MINI_KEY: validate_mvp, |
|
INTP_KEY: validate_intphys, |
|
WMQA_KEY: validate_causalvqa, |
|
} |
|
|
|
SCORER_FN = { |
|
MVP_KEY: compute_scores_mvp, |
|
MVP_MINI_KEY: compute_scores_mvp, |
|
INTP_KEY: compute_scores_intphys, |
|
WMQA_KEY: compute_scores_causalvqa, |
|
} |
|
|
|
AGGREGATE_FN = { |
|
MVP_KEY: aggregate_scores_mvp, |
|
MVP_MINI_KEY: aggregate_scores_mvp, |
|
INTP_KEY: aggregate_scores_intphys, |
|
WMQA_KEY: aggregate_scores_causalvqa, |
|
} |
|
|
|
|
|
def compute_scores(submission_df, split="valid"):
    """
    Score a submission against the held-out valid/test answer sets, dispatching
    each dataset's rows to its per-task scorer in SCORER_FN. Submissions are
    expected to have passed validate_submission() first; the returned DataFrame
    carries a per-row "rating" column.
    """
|
tasks = submission_df.data_name.unique() |
|
scored_subs = [] |
|
for t in tasks: |
|
task_sub = submission_df[submission_df.data_name == t].copy() |
|
scored_subs.append(SCORER_FN[t](task_sub, split)) |
|
scored_subs = pd.concat(scored_subs) |
|
return scored_subs |
|
|
|
|
|
def aggregate_scores(scored_df, split="valid"): |
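    """Dispatch each task's scored rows to its aggregator (AGGREGATE_FN) and merge the metric dicts."""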
|
tasks = scored_df.data_name.unique() |
|
agg_scores = {} |
|
for task in tasks: |
|
task_sub = scored_df[scored_df.data_name == task].copy() |
|
agg_metrics = AGGREGATE_FN[task](task_sub, split=split) |
|
agg_scores.update(agg_metrics) |
|
return agg_scores |
|
|
|
|
|
def validate_submission(submission_df, split="valid"):
    """
    Validate a user submission: every row must carry the columns data_name,
    row_id, task, and model_answer; each data_name must name a leaderboard
    dataset; and each dataset's validator must accept its rows.
    """
|
|
|
assert "data_name" in submission_df.columns, format_error("Submission missing column data_name") |
|
assert "row_id" in submission_df.columns, format_error("Submission missing column row_id") |
|
assert "task" in submission_df.columns, format_error("Submission missing column task") |
|
assert "model_answer" in submission_df.columns, format_error("Submission missing column model_answer") |
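    # A minimal, hypothetical submission row (field values are illustrative only):
    #   {"data_name": "intphys2", "task": "default",
    #    "row_id": "<gold example id>", "model_answer": "<answer>"}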
|
tasks = submission_df.data_name.unique() |
|
valid_tasks = [t[0] for t in TASKS] + [MVP_MINI_KEY] |
|
for t in tasks: |
|
assert t in valid_tasks, format_error( |
|
f"Submission contains one or more rows with data_name={t}, which is not a valid task for this leaderboard (expecting to match a dataset in {valid_tasks})" |
|
) |
|
|
|
for task in tasks: |
|
task_sub = submission_df[submission_df.data_name == task].copy() |
|
VALIDATION_FN[task](task_sub) |
|
|
|
|
|
def add_new_eval( |
|
model: str, |
|
vision_backbone: str, |
|
llm_backbone: str, |
|
url: str, |
|
model_type: str, |
|
path_to_file: str, |
|
organization: str, |
|
mail: str, |
|
profile: gr.OAuthProfile, |
|
progress=gr.Progress(), |
|
): |
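    """
    Handle a leaderboard submission end to end: enforce one submission per user
    per day, validate the uploaded .jsonl, archive the raw file, compute and
    archive per-task scores, push the aggregated entry to the results dataset,
    and record the submitter's contact info.
    """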
|
progress(0, desc="Validating user ...") |
|
contact_infos = load_dataset( |
|
CONTACT_DATASET, |
|
token=TOKEN, |
|
download_mode="force_redownload", |
|
verification_mode=VerificationMode.NO_CHECKS, |
|
trust_remote_code=True, |
|
) |
|
user_submission_dates = sorted( |
|
row["date"] for row in contact_infos["test"] if row["username"] == profile.username |
|
) |
|
|
|
if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime("%Y-%m-%d"): |
|
return format_error("You already submitted once today, please try again tomorrow.") |
|
|
|
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")
|
|
|
print("Adding new eval") |
|
progress(0.1, desc="Fetching recent evals ...") |
|
|
|
eval_results, _, _ = get_eval_data() |
|
|
|
|
|
|
|
|
|
|
|
|
|
if path_to_file is None: |
|
return format_warning("Please attach a file.") |
|
|
|
|
|
progress(0.3, desc="Validating user submission ...") |
|
file_path = path_to_file.name |
|
assert file_path.endswith(".jsonl"), format_error("Please submit a jsonl file") |
|
submissions_df = pd.read_json(file_path, lines=True, orient="records") |
|
validate_submission(submissions_df) |
|
|
|
|
|
if LOCAL_DEBUG: |
|
gr.Info("In local debug mode, mock uploading submission dataset.") |
|
else: |
|
api.upload_file( |
|
repo_id=SUBMISSION_DATASET, |
|
path_or_fileobj=path_to_file.name, |
|
path_in_repo=f"{organization}/{model}/submissions/test_raw_{datetime.datetime.today()}.jsonl", |
|
repo_type="dataset", |
|
token=TOKEN, |
|
) |
|
|
|
|
|
progress(0.5, desc="Computing scores ...") |
|
scored_df = compute_scores(submissions_df, split="test") |
|
|
|
|
|
if LOCAL_DEBUG: |
|
gr.Info("In local debug mode, mock uploading scored files") |
|
else: |
|
tasks = scored_df.data_name.unique() |
|
for task in tasks: |
|
            scored_df[scored_df.data_name == task].to_json(
                f"scored/{organization}_{model}_{task}.jsonl", lines=True, orient="records"
            )
|
api.upload_file( |
|
repo_id=SUBMISSION_DATASET, |
|
path_or_fileobj=f"scored/{organization}_{model}_{task}.jsonl", |
|
path_in_repo=f"{organization}/{model}/scored/{task}/test_scored_{datetime.datetime.today()}.jsonl", |
|
repo_type="dataset", |
|
token=TOKEN, |
|
) |
|
|
|
|
|
progress(0.7, desc="Submitting leaderboard entry ...") |
|
eval_entry = { |
|
"model": model, |
|
"model_type": model_type, |
|
"vision_backbone": vision_backbone, |
|
"llm_backbone": llm_backbone, |
|
"url": url, |
|
"organization": organization, |
|
"submitted_by": profile.username, |
|
"date": datetime.datetime.today().strftime("%Y-%m-%d"), |
|
} |
|
agg_metrics = aggregate_scores(scored_df, split="test") |
|
eval_entry.update(agg_metrics) |
|
|
|
task_keys = [t[0] for t in TASKS] + [MVP_MINI_KEY] |
|
missing_metrics = {f"score_{task}": MISSING_VALUE for task in task_keys if f"score_{task}" not in eval_entry} |
|
eval_entry.update(missing_metrics) |
|
|
|
eval_results["test"] = eval_results["test"].add_item(eval_entry) |
|
if LOCAL_DEBUG: |
|
        print(eval_results["test"][-1])
|
gr.Info("In local debug mode, mock uploading aggregated scores") |
|
else: |
|
eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN) |
|
|
|
progress(0.9, desc="Updating contacts ...") |
|
contact_info = { |
|
"model": model, |
|
"url": url, |
|
"organization": organization, |
|
"username": profile.username, |
|
"mail": mail, |
|
"date": datetime.datetime.today().strftime("%Y-%m-%d"), |
|
} |
|
contact_infos["test"] = contact_infos["test"].add_item(contact_info) |
|
if LOCAL_DEBUG: |
|
print("mock uploaded contact info") |
|
else: |
|
contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN) |
|
|
|
progress(1.0, desc="Completed evaluation successfully. Please refresh leaderboard") |
|
success_str = f"Model {model} submitted by {organization} is successfully evaluated and stored in our database.\nPlease wait a few hours and refresh the leaderboard to see your score displayed." |
|
format_log(success_str) |
|
return success_str |
|
|
|
|
|
def refresh_leaderboard():
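    """Rebuild the test leaderboard table from the latest results (refresh-button handler)."""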
|
_, eval_dataframe_val, eval_dataframe_test = get_eval_data() |
|
|
|
eval_dataframe_test = eval_dataframe_test[PRE_COL_NAMES + [f"{t} (%)" for _,t in VISIBLE_TASKS] + POST_COL_NAMES] |
|
    datatypes = ["markdown"] + ["number" for _ in VISIBLE_TASKS] + ["text"] * 3 + ["date"]
|
|
|
|
|
|
|
test_ldb = gr.components.Dataframe( |
|
value=eval_dataframe_test, datatype=datatypes, interactive=False, column_widths=["20%"] |
|
) |
|
return test_ldb |
|
|
|
|
|
def upload_file(files): |
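    """Return the local paths of the uploaded files for display in the file widget."""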
|
file_paths = [file.name for file in files] |
|
return file_paths |
|
|
|
if __name__ == "__main__": |
|
|
|
_, eval_dataframe_val, eval_dataframe_test = get_eval_data() |
|
demo = gr.Blocks() |
|
with demo: |
|
gr.HTML(TITLE) |
|
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") |
|
|
|
with gr.Row(): |
|
with gr.Accordion("π Citation", open=False): |
|
gr.Markdown(CITATION_BUTTON_LABEL) |
|
gr.Markdown(CITATION_BUTTON_TEXT) |
|
|
|
|
|
        datatypes = ["markdown"] + ["number" for _ in VISIBLE_TASKS] + ["text"] * 3 + ["date"]
|
|
|
with gr.Tab("Results: Test"): |
|
leaderboard_table_test = gr.components.Dataframe( |
|
value=eval_dataframe_test, datatype=datatypes, interactive=False, column_widths=["20%"] |
|
) |
|
|
|
refresh_button = gr.Button("Refresh") |
|
refresh_button.click( |
|
|
|
                refresh_leaderboard,
|
|
|
|
|
outputs=[ |
|
|
|
leaderboard_table_test, |
|
], |
|
) |
|
with gr.Accordion("Submit a new model for evaluation"): |
|
with gr.Row(): |
|
gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text") |
|
with gr.Row(): |
|
with gr.Column(): |
|
|
|
model_name_textbox = gr.Textbox(label="Model name") |
|
model_url = gr.Textbox(label="Model URL") |
|
model_type = gr.Dropdown(choices=["Open", "Closed"], label="Model Type") |
|
|
|
llm_backbone_textbox = gr.Textbox(label="LLM Backbone") |
|
vision_backbone_textbox = gr.Textbox(label="Vision Backbone") |
|
|
|
|
|
with gr.Column(): |
|
organization = gr.Textbox(label="Organization") |
|
mail = gr.Textbox( |
|
label="Contact email" |
|
) |
|
file_output = gr.File() |
|
submission_result = gr.Textbox(label="Status") |
|
with gr.Row(): |
|
with gr.Column(): |
|
gr.LoginButton() |
|
with gr.Column(): |
|
submit_button = gr.Button("Submit Eval") |
|
|
|
submit_button.click( |
|
add_new_eval, |
|
[ |
|
|
|
model_name_textbox, |
|
vision_backbone_textbox, |
|
llm_backbone_textbox, |
|
model_url, |
|
model_type, |
|
|
|
file_output, |
|
organization, |
|
mail, |
|
], |
|
submission_result, |
|
) |
|
|
|
scheduler = BackgroundScheduler() |
|
scheduler.add_job(restart_space, "interval", seconds=3600) |
|
scheduler.start() |
|
demo.launch(debug=True) |