import json
import os
import pandas as pd
from utils import create_hyperlinked_names
def sum_lst(lst):
    """Reduce a non-empty list with `+`: concatenates a list of lists, sums a list of numbers."""
    assert isinstance(lst, list) and lst, f"Input should be a non-empty list, got {type(lst)}"
    total = lst[0]
    for item in lst[1:]:
        assert isinstance(item, (list, int, float)), f"Expected types are list and numbers, got {type(item)}"
        total += item
    return total
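# Illustration (hypothetical values): because the reduction uses `+=`, sum_lst
# concatenates when the elements are lists and adds when they are numbers. This is
# how ALL_DATASETS_SPLITS and ALL_DATASETS below flatten the DATASETS hierarchy.
#   sum_lst([['a', 'b'], ['c']])  ->  ['a', 'b', 'c']
#   sum_lst([1, 2, 3])            ->  6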
SCORE_BASE_DIR = "Scores"
META_DATA = ["model_name", "model_size", "url"]
DATASETS = {
"image": {
"I-CLS": ['VOC2007', 'N24News', 'SUN397', 'ObjectNet', 'Country211', 'Place365', 'ImageNet-1K', 'HatefulMemes', 'ImageNet-A', 'ImageNet-R'],
"I-QA": ['OK-VQA', 'A-OKVQA', 'DocVQA', 'InfographicsVQA', 'ChartQA', 'Visual7W-Pointing', 'ScienceQA', 'GQA', 'TextVQA', 'VizWiz'],
"I-RET": ['VisDial', 'CIRR', 'VisualNews_t2i', 'VisualNews_i2t', 'MSCOCO_t2i', 'MSCOCO_i2t', 'NIGHTS', 'WebQA', 'FashionIQ', 'Wiki-SS-NQ', 'OVEN', 'EDIS'],
"I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W']
},
"visdoc": {
"VisDoc": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry', 'VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA', 'ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
},
"video": {
"V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
"V-QA": ['Video-MME', 'MVBench', 'NExTQA', 'EgoSchema'],
"V-RET": ['MSR-VTT', 'MSVD', 'DiDeMo', 'VATEX', 'YouCook2'],
"V-MRET": ['QVHighlight', 'Charades-STA', 'MomentSeeker', 'ActivityNetQA']
}
}
ALL_DATASETS_SPLITS = {k: sum_lst(list(v.values())) for k, v in DATASETS.items()}
ALL_DATASETS = sum_lst(list(ALL_DATASETS_SPLITS.values()))
MODALITIES = list(DATASETS.keys())
# Per-dataset metric overrides; calculate_score falls back to 'hit@1' for datasets
# without an entry here.
SPECIAL_METRICS = {
    '__default__': 'hit@1',
}
BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
TASKS = ["Overall", "Image-Overall", "I-CLS", "I-QA", "I-RET", "I-VG", "VisDoc", "Video-Overall", "V-CLS", "V-QA", "V-RET", "V-MRET"]
COLUMN_NAMES = BASE_COLS + TASKS
DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown'] + \
    ['number'] * len(TASKS)
TABLE_INTRODUCTION = """"""
LEADERBOARD_INFO = """
## Dataset Summary
"""
CITATION_BUTTON_TEXT = r""""""
def load_single_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data
def load_data(base_dir=SCORE_BASE_DIR):
    """Load every per-model score report ("*-scores_report.json") found under base_dir."""
    all_data = []
    for file_name in os.listdir(base_dir):
        if file_name.endswith('-scores_report.json'):
            file_path = os.path.join(base_dir, file_name)
            data = load_single_json(file_path)
            all_data.append(data)
    return all_data
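# Each report file is expected to be a single JSON object with a "metadata" block
# (model_name, model_size, url) and a "metrics" block keyed by modality and dataset;
# see generate_model_row and calculate_score below. The file name here is illustrative,
# e.g. a hypothetical "Scores/my_model-scores_report.json".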
def calculate_score(raw_scores=None):
    """Calculate the overall average score, per-modality averages, and per-sub-task averages from the raw scores.

    Algorithm summary:
    - Each sub-task score is the mean over its datasets (missing or "FILE_N/A" scores count as 0.0).
    - Each "<Modality>-Overall" score is the mean over all datasets of that modality.
    - "Overall" is the mean over all datasets of all modalities.
    """
    def get_avg(sum_score, leng):
        avg = sum_score / leng if leng > 0 else 0.0
        avg = round(avg, 2)  # Round to 2 decimal places
        return avg
    avg_scores = {}
    overall_scores_summary = {}  # Stores the scores sum and length for each modality and all datasets
    for modality, datasets_list in DATASETS.items():  # Ex.: ('image', {'I-CLS': [...], 'I-QA': [...]})
        overall_scores_summary[modality] = (0.0, 0)  # Initialize the sum and count for each modality
        for sub_task, datasets in datasets_list.items():  # Ex.: ('I-CLS', ['VOC2007', 'N24News', ...])
            sub_task_sum_score, sub_task_datasets_len = 0.0, len(datasets)
            for dataset in datasets:  # Ex.: 'VOC2007'
                score = raw_scores.get(modality, {}).get(dataset, 0.0)
                score = 0.0 if score == "FILE_N/A" else score
                metric = SPECIAL_METRICS.get(dataset, 'hit@1')
                if isinstance(score, dict):
                    score = score.get(metric, 0.0)
                sub_task_sum_score += score
            sub_task_overall = get_avg(sub_task_sum_score, sub_task_datasets_len)
            avg_scores[sub_task] = sub_task_overall
            # Accumulate the scores sum and length for each modality
            modality_sum_score, modality_datasets_len = overall_scores_summary[modality]
            modality_sum_score += sub_task_sum_score
            modality_datasets_len += sub_task_datasets_len
            overall_scores_summary[modality] = (modality_sum_score, modality_datasets_len)
    all_datasets_sum_score, all_datasets_len = 0.0, 0
    for modality, (modality_sum_score, modality_datasets_len) in overall_scores_summary.items():
        name = f"{modality.capitalize()}-Overall"
        avg_scores[name] = get_avg(modality_sum_score, modality_datasets_len)
        # Accumulate the scores sum and length for all datasets
        all_datasets_sum_score += modality_sum_score
        all_datasets_len += modality_datasets_len
    avg_scores['Overall'] = get_avg(all_datasets_sum_score, all_datasets_len)
    return avg_scores
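# Minimal sketch of the expected shapes (values are made up for illustration):
#   raw_scores = {"image": {"VOC2007": {"hit@1": 81.2}, "OK-VQA": 55.0, ...},
#                 "visdoc": {...}, "video": {...}}
#   calculate_score(raw_scores) -> {"I-CLS": ..., "Image-Overall": ..., "Overall": ..., ...}
# Per-dataset entries may be plain numbers or dicts keyed by metric name (default "hit@1");
# datasets missing from raw_scores, or marked "FILE_N/A", contribute 0.0 to the averages.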
def generate_model_row(data):
    """Build one leaderboard row (model metadata plus averaged scores) from a single score report."""
    metadata = data['metadata']
    row = {
        'Models': metadata.get('model_name', None),
        'Model Size(B)': metadata.get('model_size', None),
        'URL': metadata.get('url', None)
    }
    scores = calculate_score(data['metrics'])
    row.update(scores)
    return row
def get_df():
    """Generates a DataFrame from the loaded data."""
    all_data = load_data()
    rows = [generate_model_row(data) for data in all_data]
    df = pd.DataFrame(rows)
    df = df.sort_values(by='Overall', ascending=False).reset_index(drop=True)
    df['Rank'] = range(1, len(df) + 1)
    df = create_hyperlinked_names(df)
    return df
def refresh_data():
    df = get_df()
    return df[COLUMN_NAMES]
def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()
    if query:
        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]
    # Rows with an 'unknown' size are kept only when the size filter is effectively
    # unbounded (i.e. the selected range still covers the 1000.0 sentinel value).
    size_mask = filtered_df['Model Size(B)'].apply(
        lambda x: (min_size <= 1000.0 <= max_size) if x == 'unknown'
        else (min_size <= x <= max_size))
    filtered_df = filtered_df[size_mask]
    return filtered_df[COLUMN_NAMES]
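
# Hedged usage sketch (not part of the leaderboard app itself): assumes a populated
# "Scores" directory and an importable utils.create_hyperlinked_names helper; the
# query string and size bounds below are arbitrary examples.
if __name__ == "__main__":
    leaderboard = refresh_data()
    print(leaderboard.head())
    # Example filter: models whose name contains "clip" and whose size is between 1B and 20B.
    print(search_and_filter_models(get_df(), "clip", 1.0, 20.0))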