import json
import os
import pandas as pd
from utils import create_hyperlinked_names, process_model_size

def sum_lol(lol):
    """Flatten a list of lists into a single list, preserving order."""
    assert isinstance(lol, list) and all(isinstance(i, list) for i in lol), \
        f"Input should be a list of lists, got {type(lol)}"
    total = []
    for sublist in lol:
        total.extend(sublist)
    return total
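
# Example (illustrative): sum_lol([[1, 2], [3]]) -> [1, 2, 3]. It is used
# below to merge per-sub-task dataset lists into flat per-modality lists.
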
SCORE_BASE_DIR = "scores"
META_DATA = ["model_name", "model_size", "url"]
DATASETS = {
    "image": {
        "I-CLS": ['VOC2007', 'N24News', 'SUN397', 'ObjectNet', 'Country211', 'Place365', 'ImageNet-1K', 'HatefulMemes', 'ImageNet-A', 'ImageNet-R'],
        "I-QA": ['OK-VQA', 'A-OKVQA', 'DocVQA', 'InfographicsVQA', 'ChartQA', 'Visual7W-Pointing', 'ScienceQA', 'GQA', 'TextVQA', 'VizWiz'],
        "I-RET": ['VisDial', 'CIRR', 'VisualNews_t2i', 'VisualNews_i2t', 'MSCOCO_t2i', 'MSCOCO_i2t', 'NIGHTS', 'WebQA', 'FashionIQ', 'Wiki-SS-NQ', 'OVEN', 'EDIS'],
        "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W'],
    },
    "visdoc": {
        "VisDoc": [
            'ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject',
            'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy',
            'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry',
            'VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA',
            'ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc',
            'ViDoRe_esg_reports_human_labeled_v2', 'ViDoRe_biomedical_lectures_v2', 'ViDoRe_biomedical_lectures_v2_multilingual',
            'ViDoRe_economics_reports_v2', 'ViDoRe_economics_reports_v2_multilingual', 'ViDoRe_esg_reports_v2', 'ViDoRe_esg_reports_v2_multilingual',
        ],
    },
    "video": {
        "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
        "V-QA": ['Video-MME', 'MVBench', 'NExTQA', 'EgoSchema'],
        "V-RET": ['MSR-VTT', 'MSVD', 'DiDeMo', 'VATEX', 'YouCook2'],
        "V-MRET": ['QVHighlight', 'Charades-STA', 'MomentSeeker', 'ActivityNetQA'],
    },
}
ALL_DATASETS_SPLITS = {k: sum_lol(list(v.values())) for k, v in DATASETS.items()}
ALL_DATASETS = sum_lol(list(ALL_DATASETS_SPLITS.values()))
MODALITIES = list(DATASETS.keys())
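# Shape sketch (illustrative): ALL_DATASETS_SPLITS maps each modality to the
# flat list of its dataset names, e.g.
#   ALL_DATASETS_SPLITS['video'][:2] == ['K700', 'UCF101']
# and ALL_DATASETS concatenates the lists of all three modalities.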
SPECIAL_METRICS = {
    '__default__': 'hit@1',
}
BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
TASKS = ["Overall", "I-CLS", "I-QA", "I-RET", "I-VG", "VisDoc", "V-CLS", "V-QA", "V-RET", "V-MRET"]
BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']
COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'VisDoc']
DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + ['number'] * 3
SUB_TASKS_I = TASKS[1:5]
TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
COLUMN_NAMES_I = BASE_COLS + TASKS_I
DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + ['number'] * (len(TASKS_I) + 4)
SUB_TASKS_V = TASKS[6:10]
TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
COLUMN_NAMES_V = BASE_COLS + TASKS_V
DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + ['number'] * (len(TASKS_V) + 4)
TASKS_D = ['VisDoc'] + ALL_DATASETS_SPLITS['visdoc']
COLUMN_NAMES_D = BASE_COLS + TASKS_D
DATA_TITLE_TYPE_D = BASE_DATA_TITLE_TYPE + ['number'] * len(TASKS_D)
TABLE_INTRODUCTION = """**MMEB**: Massive MultiModal Embedding Benchmark \n
Models are ranked based on **Overall**"""
TABLE_INTRODUCTION_I = """**I-CLS**: Image Classification, **I-QA**: (Image) Visual Question Answering, **I-RET**: Image Retrieval, **I-VG**: (Image) Visual Grounding \n
Models are ranked based on **Image-Overall**"""
TABLE_INTRODUCTION_V = """**V-CLS**: Video Classification, **V-QA**: (Video) Visual Question Answering, **V-RET**: Video Retrieval, **V-MRET**: Video Moment Retrieval \n
Models are ranked based on **Video-Overall**"""
TABLE_INTRODUCTION_D = """**VisDoc**: Visual Document Understanding \n
Models are ranked based on **VisDoc**"""
LEADERBOARD_INFO = """
## Dataset Summary
"""
CITATION_BUTTON_TEXT = r"""TBA"""

def load_single_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def load_data(base_dir=SCORE_BASE_DIR):
    all_data = []
    for file_name in os.listdir(base_dir):
        if file_name.endswith('.json'):
            file_path = os.path.join(base_dir, file_name)
            data = load_single_json(file_path)
            all_data.append(data)
    return all_data

def load_scores(raw_scores=None):
    """Flatten the user-provided raw scores summary into a single {dataset: score} dictionary."""
    raw_scores = raw_scores or {}  # guard against a missing scores summary
    all_scores = {}
    for modality, datasets_list in DATASETS.items():  # e.g. ('image', {'I-CLS': [...], 'I-QA': [...]})
        for sub_task, datasets in datasets_list.items():  # e.g. ('I-CLS', ['VOC2007', 'N24News', ...])
            for dataset in datasets:  # e.g. 'VOC2007'
                score = raw_scores.get(modality, {}).get(dataset, 0.0)
                score = 0.0 if score == "FILE_N/A" else score
                metric = SPECIAL_METRICS.get(dataset, 'hit@1')
                if isinstance(score, dict):
                    if modality == 'visdoc':
                        metric = "ndcg_linear@5" if "ndcg_linear@5" in score else "ndcg@5"
                    score = score.get(metric, 0.0)
                all_scores[dataset] = round(score * 100.0, 2)
    return all_scores
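
# Illustrative sketch (the nesting below is inferred from the .get() chain
# above, not a documented schema): raw_scores is keyed by modality, then
# dataset, holding either a plain fraction or a {metric: value} dict, e.g.
#   load_scores({'image': {'VOC2007': 0.85}})  # -> {'VOC2007': 85.0, ...}
# with every dataset missing from the summary defaulting to 0.0.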

def calculate_score(raw_scores=None):
    """Calculate the overall average score across all datasets, plus average
    scores for each modality and each sub-task, from the raw scores."""
    def get_avg(sum_score, leng):
        avg = sum_score / leng if leng > 0 else 0.0
        avg = round(avg, 2)  # Round to 2 decimal places
        return avg

    all_scores = load_scores(raw_scores)
    avg_scores = {}
    # Calculate the overall score across all datasets
    avg_scores['Overall'] = get_avg(sum(all_scores.values()), len(ALL_DATASETS))
    # Calculate scores for each modality
    for modality in MODALITIES:
        datasets_for_each_modality = ALL_DATASETS_SPLITS[modality]
        avg_scores[f"{modality.capitalize()}-Overall"] = get_avg(
            sum(all_scores.get(dataset, 0.0) for dataset in datasets_for_each_modality),
            len(datasets_for_each_modality)
        )
    # Calculate scores for each sub-task
    for modality, datasets_list in DATASETS.items():
        for sub_task, datasets in datasets_list.items():
            sub_task_score = sum(all_scores.get(dataset, 0.0) for dataset in datasets)
            avg_scores[sub_task] = get_avg(sub_task_score, len(datasets))
    all_scores.update(avg_scores)
    return all_scores
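
# Illustrative example (hypothetical numbers): calculate_score returns the
# per-dataset percentages plus derived keys such as 'Overall',
# 'Image-Overall', and per-sub-task averages, e.g.
#   scores = calculate_score({'image': {'VOC2007': 0.90}})
#   scores['I-CLS']  # average over the 10 I-CLS datasets, in [0, 100]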

def generate_model_row(data):
    metadata = data['metadata']
    row = {
        'Models': metadata.get('model_name', None),
        'Model Size(B)': metadata.get('model_size', None),
        'URL': metadata.get('url', None),
        'Data Source': metadata.get('data_source', 'Self-Reported'),
    }
    scores = calculate_score(data['metrics'])
    row.update(scores)
    return row
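
# Expected file shape (an inference from the lookups above, not a documented
# schema): each scores/*.json holds
#   {'metadata': {'model_name': ..., 'model_size': ..., 'url': ..., 'data_source': ...},
#    'metrics': <raw scores as consumed by load_scores>}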

def rank_models(df, column='Overall', rank_name='Rank'):
    """Rank models by the specified score column, best first."""
    df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
    df[rank_name] = range(1, len(df) + 1)
    return df
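
# Usage sketch (illustrative): each per-tab view re-ranks by its own overall
# column, e.g. rank_models(df, column='Image-Overall') for the image tab.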

def get_df():
    """Generates a DataFrame from the loaded data."""
    all_data = load_data()
    rows = [generate_model_row(data) for data in all_data]
    df = pd.DataFrame(rows)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    df = create_hyperlinked_names(df)
    df = rank_models(df, column='Overall')
    return df

def refresh_data():
    df = get_df()
    return df[COLUMN_NAMES]

def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()
    if query:
        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]
    # Models of unknown size are treated as if they were 1000B, so they only
    # survive the filter when the selected range extends that far.
    size_mask = filtered_df['Model Size(B)'].apply(
        lambda x: (min_size <= 1000.0 <= max_size) if x == 'unknown'
        else (min_size <= x <= max_size))
    filtered_df = filtered_df[size_mask]
    return filtered_df[COLUMN_NAMES]
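
# Minimal smoke test (an assumption for local runs, not part of the original
# app wiring): it requires a populated 'scores/' directory containing *.json
# files matching the schema consumed by generate_model_row.
if __name__ == "__main__":
    leaderboard = refresh_data()
    print(leaderboard.head())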