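"""Constants, score loading, and leaderboard-table construction for the MMEB
(Massive MultiModal Embedding Benchmark) leaderboard."""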
import json
import os
import pandas as pd
from utils import create_hyperlinked_names, process_model_size

def sum_lol(lol):
    assert isinstance(lol, list) and all(isinstance(i, list) for i in lol), f"Input should be a list of lists, got {type(lol)}"
    total = []
    for sublist in lol:
        total.extend(sublist)
    return total

SCORE_BASE_DIR = "scores"
META_DATA = ["model_name", "model_size", "url"]
DATASETS = {
    "image": {
        "I-CLS": ['VOC2007', 'N24News', 'SUN397', 'ObjectNet', 'Country211', 'Place365', 'ImageNet-1K', 'HatefulMemes', 'ImageNet-A', 'ImageNet-R'], 
        "I-QA": ['OK-VQA', 'A-OKVQA', 'DocVQA', 'InfographicsVQA', 'ChartQA', 'Visual7W-Pointing', 'ScienceQA', 'GQA', 'TextVQA', 'VizWiz'], 
        "I-RET": ['VisDial', 'CIRR', 'VisualNews_t2i', 'VisualNews_i2t', 'MSCOCO_t2i', 'MSCOCO_i2t', 'NIGHTS', 'WebQA', 'FashionIQ', 'Wiki-SS-NQ', 'OVEN', 'EDIS'],
        "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W']
        }, 
    "visdoc": {
        "VisDoc": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry', 'VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA', 'ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc', "ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2", "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2", "ViDoRe_esg_reports_v2_multilingual"]
        }, 
    "video": {
        "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'], 
        "V-QA": ['Video-MME', 'MVBench', 'NExTQA', 'EgoSchema'], 
        "V-RET": ['MSR-VTT', 'MSVD', 'DiDeMo', 'VATEX', 'YouCook2'], 
        "V-MRET": ['QVHighlight', 'Charades-STA', 'MomentSeeker', 'ActivityNetQA']
        }
}
ALL_DATASETS_SPLITS = {k: sum_lol(list(v.values())) for k, v in DATASETS.items()}
ALL_DATASETS = sum_lol(list(ALL_DATASETS_SPLITS.values()))
MODALITIES = list(DATASETS.keys())
SPECIAL_METRICS = {
    '__default__': 'hit@1',
}

BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
TASKS = ["Overall", "I-CLS", "I-QA", "I-RET", "I-VG", "VisDoc", "V-CLS", "V-QA", "V-RET", "V-MRET"]
BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']

COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'VisDoc']
DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + \
                    ['number'] * 3

SUB_TASKS_I = TASKS[1:5]  # I-CLS, I-QA, I-RET, I-VG
TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
COLUMN_NAMES_I = BASE_COLS + TASKS_I
DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + \
                    ['number'] * (len(TASKS_I) + 4)

SUB_TASKS_V = TASKS[6:10]  # V-CLS, V-QA, V-RET, V-MRET
TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
COLUMN_NAMES_V = BASE_COLS + TASKS_V
DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + \
                    ['number'] * (len(TASKS_V) + 4)

TASKS_D = ['VisDoc'] + ALL_DATASETS_SPLITS['visdoc']
COLUMN_NAMES_D = BASE_COLS + TASKS_D
DATA_TITLE_TYPE_D = BASE_DATA_TITLE_TYPE + \
                    ['number'] * len(TASKS_D)

TABLE_INTRODUCTION = """**MMEB**: Massive MultiModal Embedding Benchmark \n
                        Models are ranked based on **Overall**"""
TABLE_INTRODUCTION_I = """**I-CLS**: Image Classification, **I-QA**: (Image) Visual Question Answering, **I-RET**: Image Retrieval, **I-VG**: (Image) Visual Grounding \n
                        Models are ranked based on **Image-Overall**"""
TABLE_INTRODUCTION_V = """**V-CLS**: Video Classification, **V-QA**: (Video) Visual Question Answering, **V-RET**: Video Retrieval, **V-MRET**: Video Moment Retrieval \n
                        Models are ranked based on **Video-Overall**"""
TABLE_INTRODUCTION_D = """**VisDoc**: Visual Document Understanding \n
                        Models are ranked based on **VisDoc**"""

LEADERBOARD_INFO = """
## Dataset Summary
"""

CITATION_BUTTON_TEXT = r"""TBA"""

def load_single_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def load_data(base_dir=SCORE_BASE_DIR):
    all_data = []
    for file_name in os.listdir(base_dir):
        if file_name.endswith('.json'):
            file_path = os.path.join(base_dir, file_name)
            data = load_single_json(file_path)
            all_data.append(data)
    return all_data

def load_scores(raw_scores=None):
    """This function loads the raw scores from the user provided scores summary and flattens them into a single dictionary."""
    all_scores = {}
    for modality, datasets_list in DATASETS.items(): # Ex.: ('image', {'I-CLS': [...], 'I-QA': [...]})
        for sub_task, datasets in datasets_list.items(): # Ex.: ('I-CLS', ['VOC2007', 'N24News', ...])
            for dataset in datasets: # Ex.: 'VOC2007'
                score = raw_scores.get(modality, {}).get(dataset, 0.0)
                score = 0.0 if score == "FILE_N/A" else score
                metric = SPECIAL_METRICS.get(dataset, SPECIAL_METRICS['__default__'])
                if isinstance(score, dict):
                    if modality == 'visdoc':
                        metric = "ndcg_linear@5" if "ndcg_linear@5" in score else "ndcg@5"
                    score = score.get(metric, 0.0)
                all_scores[dataset] = round(score * 100.0, 2)
    return all_scores
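
# Illustrative sketch (not part of the original module): the shape of
# `raw_scores` that load_scores() expects, with hypothetical values. Scores may
# be plain floats or per-metric dicts; either way the result is a flat
# {dataset: percentage} mapping, with 0.0 for datasets missing from the input.
#
#   raw_scores = {
#       "image": {"VOC2007": {"hit@1": 0.853}, "N24News": 0.412},
#       "visdoc": {"ViDoRe_docvqa": {"ndcg@5": 0.611}},
#   }
#   load_scores(raw_scores)
#   # -> {"VOC2007": 85.3, "N24News": 41.2, "ViDoRe_docvqa": 61.1, ..., "K700": 0.0, ...}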

def calculate_score(raw_scores=None):
    """This function calculates the overall average scores for all datasets as well as avg scores for each modality and sub-task based on the raw scores.
    """
    def get_avg(sum_score, leng):
        avg = sum_score / leng if leng > 0 else 0.0
        avg = round(avg, 2)  # Round to 2 decimal places
        return avg
    
    all_scores = load_scores(raw_scores)
    avg_scores = {}

    # Calculate overall score for all datasets
    avg_scores['Overall'] = get_avg(sum(all_scores.values()), len(ALL_DATASETS))

    # Calculate scores for each modality
    for modality in MODALITIES:
        datasets_for_each_modality = ALL_DATASETS_SPLITS[modality]
        avg_scores[f"{modality.capitalize()}-Overall"] = get_avg(
            sum(all_scores.get(dataset, 0.0) for dataset in datasets_for_each_modality),
            len(datasets_for_each_modality)
        )
    
    # Calculate scores for each sub-task
    for modality, datasets_list in DATASETS.items():
        for sub_task, datasets in datasets_list.items():
            sub_task_score = sum(all_scores.get(dataset, 0.0) for dataset in datasets)
            avg_scores[sub_task] = get_avg(sub_task_score, len(datasets))

    all_scores.update(avg_scores)
    return all_scores
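
# Continuing the sketch above: calculate_score() layers the aggregate keys on
# top of the per-dataset scores, e.g.
#   {"Overall": ..., "Image-Overall": ..., "Video-Overall": ..., "Visdoc-Overall": ...,
#    "I-CLS": ..., "VisDoc": ..., "V-MRET": ..., "VOC2007": 85.3, ...}
# Note that the "VisDoc" column shown in the tables comes from the sub-task key,
# while modality.capitalize() separately yields "Visdoc-Overall".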

def generate_model_row(data):
    metadata = data['metadata']
    row = {
        'Models': metadata.get('model_name', None), 
        'Model Size(B)': metadata.get('model_size', None),
        'URL': metadata.get('url', None), 
        'Data Source': metadata.get('data_source', 'Self-Reported'),
    }
    scores = calculate_score(data['metrics'])
    row.update(scores)
    return row

def rank_models(df, column='Overall', rank_name='Rank'):
    """Ranks the models based on the specific score."""
    df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
    df[rank_name] = range(1, len(df) + 1)
    return df

def get_df():
    """Generates a DataFrame from the loaded data."""
    all_data = load_data()
    rows = [generate_model_row(data) for data in all_data]
    df = pd.DataFrame(rows)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    df = create_hyperlinked_names(df)
    df = rank_models(df, column='Overall')
    return df

def refresh_data():
    df = get_df()
    return df[COLUMN_NAMES]

def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()
    
    if query:
        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]

    size_mask = filtered_df['Model Size(B)'].apply(lambda x: 
        (min_size <= 1000.0 <= max_size) if x == 'unknown' 
        else (min_size <= x <= max_size))
    
    filtered_df = filtered_df[size_mask]
    
    return filtered_df[COLUMN_NAMES]
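
# Minimal usage sketch (an assumption, not part of the original module): it
# presumes a local `scores/` directory of per-model JSON files, each containing
# "metadata" and "metrics" fields, and that utils.py provides the helpers
# imported above.
if __name__ == "__main__":
    leaderboard = refresh_data()
    print(leaderboard.head())

    # Hypothetical filter: model names containing "clip", sized 0-20B parameters.
    filtered = search_and_filter_models(get_df(), query="clip", min_size=0, max_size=20)
    print(filtered.head())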