AtsuMiyai committed on
Commit
3e8020b
·
0 Parent(s):

initial commit

Files changed (10)
  1. .gitattributes +55 -0
  2. .gitignore +13 -0
  3. .pre-commit-config.yaml +53 -0
  4. Makefile +13 -0
  5. README.md +45 -0
  6. app.py +702 -0
  7. constants.py +90 -0
  8. pyproject.toml +13 -0
  9. requirements.txt +19 -0
  10. src/utils_display.py +99 -0
.gitattributes ADDED
@@ -0,0 +1,55 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
1
+ .PHONY: style quality
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md ADDED
@@ -0,0 +1,45 @@
1
+ ---
2
+ title: Demo Leaderboard
3
+ emoji: 🥇
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.4.0
8
+ app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # Start the configuration
14
+
15
+ Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
16
+
17
+ Results files should have the following format and be stored as json files:
18
+ ```json
19
+ {
20
+ "config": {
21
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
22
+ "model_name": "path of the model on the hub: org/model",
23
+ "model_sha": "revision on the hub",
24
+ },
25
+ "results": {
26
+ "task_name": {
27
+ "metric_name": score,
28
+ },
29
+ "task_name2": {
30
+ "metric_name": score,
31
+ }
32
+ }
33
+ }
34
+ ```
35
+
36
+ Request files are created automatically by this tool.
37
+
38
+ If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
39
+
40
+ # Code logic for more complex edits
41
+
42
+ You'll find
43
+ - the main table's column names and properties in `src/display/utils.py`
44
+ - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
45
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
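As a quick illustration of the results-file format documented in README.md above, here is a minimal sketch of how such a JSON file could be loaded and flattened into one leaderboard row. The file name and the task/metric keys are hypothetical placeholders, not part of this repository.

```python
import json

# Minimal sketch (not part of this repo): read one results file in the
# documented format and flatten it into a single leaderboard row.
with open("results_org_model.json") as f:  # placeholder file name
    data = json.load(f)

row = {
    "model": data["config"]["model_name"],
    "revision": data["config"]["model_sha"],
    "dtype": data["config"]["model_dtype"],
}
for task_name, metrics in data["results"].items():
    for metric_name, score in metrics.items():
        row[f"{task_name} ({metric_name})"] = score

print(row)
```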
app.py ADDED
@@ -0,0 +1,702 @@
1
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import re
7
+ import numpy as np
8
+ from collections import defaultdict
9
+ from constants import *
10
+ import os
11
+ from huggingface_hub import Repository
12
+
13
+
14
+ global data_component_aad, data_component_iasd, data_component_ivqd, filter_component
15
+
16
+
17
+ TOKEN = os.environ.get("TOKEN")
18
+
19
+ repo = Repository(local_dir="./download_from_dataset", clone_from="MM-UPD/results_for_leaderboard", repo_type="dataset", use_auth_token=TOKEN)
20
+
21
+ current_directory = os.getcwd()
22
+
23
+
24
+ def validate_model_size(s):
25
+ pattern = r'^\d+B$|^-$'
26
+ if re.match(pattern, s):
27
+ return s
28
+ else:
29
+ return '-'
30
+
31
+
32
+ def upload_file(files):
33
+ file_paths = [file.name for file in files]
34
+ return file_paths
35
+
36
+
37
+ # Accuracy Report
38
+ def report_acc(df, groupd='category', metric_type="dual"):
39
+ assert 'split' in df
40
+ assert groupd in [None, 'category', 'l2-category']
41
+
42
+ res = defaultdict(list)
43
+ res['split'] = ['test']
44
+ if groupd is None:
45
+ if metric_type == "dual":
46
+ res['overall'] = [
47
+ np.mean(df['hit']),
48
+ ]
49
+ elif metric_type == "standard":
50
+ res['overall'] = [
51
+ np.mean(df['hit_standard']),
52
+ ]
53
+ elif metric_type == "upd":
54
+ res['overall'] = [
55
+ np.mean(df['hit_upd']),
56
+ ]
57
+ return pd.DataFrame(res)
58
+
59
+ elif groupd in df:
60
+ abilities = list(set(df[groupd]))
61
+ abilities.sort()
62
+ for ab in abilities:
63
+ sub_df = df[df[groupd] == ab]
64
+ if metric_type == "dual":
65
+ res[ab] = [
66
+ np.mean(sub_df['hit']),
67
+ ]
68
+ elif metric_type == "standard":
69
+ res[ab] = [
70
+ np.mean(sub_df['hit_standard']),
71
+ ]
72
+ elif metric_type == "upd":
73
+ res[ab] = [
74
+ np.mean(sub_df['hit_upd']),
75
+ ]
76
+
77
+ return pd.DataFrame(res)
78
+
79
+
80
+ def eval_result_dual(data_main, metric_type="dual"):
81
+ overall = report_acc(data_main, None, metric_type)
82
+ leaf = report_acc(data_main, 'category', metric_type)
83
+
84
+ overall = round(overall['overall'].values[0] * 100, 1)
85
+ leaf = leaf.iloc[:, 1:].values.flatten().tolist()
86
+ leaf = [round(x * 100, 1) for x in leaf]
87
+
88
+ return overall, leaf
89
+
90
+
91
+ def calculate_score(dual_df_path):
92
+ dual_df = pd.read_excel(dual_df_path)
93
+ overall_dual, leaf_dual = eval_result_dual(dual_df)
94
+ overall_standard, leaf_standard = eval_result_dual(dual_df, metric_type="standard")
95
+ overall_upd, leaf_upd = eval_result_dual(dual_df, metric_type="upd")
96
+
97
+ return overall_dual, overall_standard, overall_upd, leaf_dual
98
+
99
+
100
+ # add the new data into the queue
101
+ def add_queue(base_df, dual_df_path, model_name):
102
+ dual_df = pd.read_excel(dual_df_path)
103
+ base_df[f"{model_name}_prediction_standard"] = dual_df["prediction_standard"]
104
+ base_df[f"{model_name}_hit_standard"] = dual_df["hit_standard"]
105
+ base_df[f"{model_name}_prediction_upd"] = dual_df["prediction_upd"]
106
+ base_df[f"{model_name}_hit_upd"] = dual_df["hit_upd"]
107
+ base_df[f"{model_name}_hit"] = dual_df["hit"]
108
+ return base_df
109
+
110
+
111
+ # check whether the input file is correct or not
112
+ def validity_check(input, UPD_type, question_type):
113
+
114
+ input_df = pd.read_excel(input)
115
+
116
+ # check for the correct data size
117
+ data_num_dict = {"AAD": 820, "IASD": 919, "IVQD": 356}
118
+ assert len(input_df) == data_num_dict[UPD_type], "Different Data Size"
119
+ print("len(input)", len(input_df))
120
+ print("data_num_dict[UPD_type]", data_num_dict[UPD_type])
121
+ # check for missing columns
122
+ column_list = ["hit_upd", "hit_standard", "hit", "prediction_upd", "prediction_standard"]
123
+ assert all(x in input_df.columns for x in column_list), "Column Missing"
124
+
125
+ # check for missing values
126
+ assert not input_df[column_list].isnull().any().any(), "Missing values found in columns"
127
+
128
+ # check for the presence of the correct values
129
+ option_mapping = {"AAD": "None of the above", "IASD": "None of the above", "IVQD": "The image and question are irrelevant."}
130
+ instruction_mapping = {"AAD": "F. None of the above", "IASD": "F. None of the above", "IVQD": "F. The image and question are irrelevant."}
131
+
132
+ input_df["D_upd"] = input_df["D_upd"].fillna("")
133
+
134
+ if question_type == "Base":
135
+ assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Base"
136
+ assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Base"
137
+ elif question_type == "Option":
138
+ assert input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]}not found in Option"
139
+ assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Option"
140
+ elif question_type == "Instruction":
141
+ assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Instruction"
142
+
143
+ return True
144
+
145
+
146
+ def add_new_eval(
147
+ input_file,
148
+ model_type: str,
149
+ model_name_textbox: str,
150
+ revision_name_textbox: str,
151
+ model_link: str,
152
+ model_size: str,
153
+ upd_type: str,
154
+ LLM_type: str,
155
+ LLM_name_textbox: str,
156
+ question_type: str
157
+
158
+ ):
159
+
160
+ if input_file is None:
161
+ warning_text = "Error! Empty file!"
162
+ print(warning_text)
163
+ return warning_text
164
+ else:
165
+ model_size = validate_model_size(model_size)
166
+ if upd_type == 'AAD':
167
+ csv_path = CSV_AAD_RESULT_PATH
168
+ elif upd_type == 'IASD':
169
+ csv_path = CSV_IASD_RESULT_PATH
170
+ elif upd_type == 'IVQD':
171
+ csv_path = CSV_IVQD_RESULT_PATH
172
+
173
+ validity_check(input_file, upd_type, question_type)
174
+
175
+ csv_data = pd.read_csv(csv_path)
176
+
177
+ overall_dual_acc, overall_standard_acc, overall_upd_acc, leaf_dual = calculate_score(input_file)
178
+
179
+ if LLM_type == 'Other':
180
+ LLM_name = LLM_name_textbox
181
+ else:
182
+ LLM_name = LLM_type
183
+
184
+ if revision_name_textbox == '':
185
+ col = csv_data.shape[0]
186
+ model_name = model_name_textbox
187
+ else:
188
+ model_name = revision_name_textbox
189
+ model_name_list = csv_data['Model']
190
+ name_list = [name.split(']')[0][1:] for name in model_name_list]
191
+ if revision_name_textbox not in name_list:
192
+ col = csv_data.shape[0]
193
+ else:
194
+ col = name_list.index(revision_name_textbox)
195
+
196
+ if model_link == '':
197
+ model_name = model_name # no url
198
+ else:
199
+ model_name = '[' + model_name + '](' + model_link + ')'
200
+
201
+ # add new data
202
+ new_data = [
203
+ model_type,
204
+ model_name,
205
+ LLM_name,
206
+ model_size,
207
+ question_type,
208
+ overall_dual_acc,
209
+ overall_standard_acc,
210
+ overall_upd_acc,
211
+ ]
212
+ new_data += leaf_dual
213
+
214
+ # If the same data already exists, return an error.
215
+ if new_data in csv_data.values.tolist():
216
+ warning_text = "Error! The same data already exists!"
217
+ print(warning_text)
218
+ return warning_text
219
+ # If the same model name already exists, return an error.
220
+ elif new_data[:5] in csv_data.values.tolist():
221
+ warning_text = "Error! The same data already exists! Please fill revision_name."
222
+ print(warning_text)
223
+ return warning_text
224
+
225
+ csv_data.loc[col] = new_data
226
+ csv_data.to_csv(csv_path, index=False)
227
+
228
+ absolute_result_path = os.path.abspath(csv_path)
229
+ if not os.path.exists(absolute_result_path):
230
+ raise FileNotFoundError(f"File {absolute_result_path} not found")
231
+
232
+ repo.git_pull()
233
+ repo.git_add(absolute_result_path)
234
+
235
+ csv_queue_path = os.path.join(CSV_QUEUE_DIR, f"detail_results_{upd_type.lower()}_{question_type.lower()}.csv")
236
+ base_data = pd.read_csv(csv_queue_path)
237
+
238
+ base_data = add_queue(base_data, input_file, model_name)
239
+ base_data.to_csv(csv_queue_path, index=False)
240
+
241
+ absolute_queue_path = os.path.abspath(csv_queue_path)
242
+ if not os.path.exists(absolute_queue_path):
243
+ raise FileNotFoundError(f"File {absolute_queue_path} not found")
244
+
245
+ repo.git_add(absolute_queue_path)
246
+ repo.git_commit(f"add {model_name} results in {question_type}")
247
+
248
+ repo.git_push()
249
+
250
+ return 0
251
+
252
+
253
+ def get_baseline_aad_df():
254
+ repo.git_pull()
255
+ df = pd.read_csv(CSV_AAD_RESULT_PATH)
256
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
257
+ present_columns = MODEL_INFO + checkbox_aad_group.value
258
+ df = df[present_columns]
259
+ return df
260
+
261
+
262
+ def get_all_aad_df():
263
+ repo.git_pull()
264
+ df = pd.read_csv(CSV_AAD_RESULT_PATH)
265
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
266
+ return df
267
+
268
+
269
+ def get_baseline_iasd_df():
270
+ repo.git_pull()
271
+ df = pd.read_csv(CSV_IASD_RESULT_PATH)
272
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
273
+ present_columns = MODEL_INFO + checkbox_iasd_group.value
274
+ df = df[present_columns]
275
+ return df
276
+
277
+
278
+ def get_all_iasd_df():
279
+ repo.git_pull()
280
+ df = pd.read_csv(CSV_IASD_RESULT_PATH)
281
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
282
+ return df
283
+
284
+
285
+ def get_baseline_ivqd_df():
286
+ repo.git_pull()
287
+ df = pd.read_csv(CSV_IVQD_RESULT_PATH)
288
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
289
+ present_columns = MODEL_INFO + checkbox_ivqd_group.value
290
+ df = df[present_columns]
291
+ return df
292
+
293
+
294
+ def get_all_ivqd_df():
295
+ repo.git_pull()
296
+ df = pd.read_csv(CSV_IVQD_RESULT_PATH)
297
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
298
+ return df
299
+
300
+
301
+ block = gr.Blocks()
302
+
303
+
304
+ with block:
305
+ gr.Markdown(
306
+ LEADERBORAD_INTRODUCTION
307
+ )
308
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
309
+ # table mmupd bench
310
+ with gr.TabItem("🏅 MM-AAD Benchmark", elem_id="mmaad-benchmark-tab-table", id=1):
311
+ with gr.Row():
312
+ with gr.Accordion("Citation", open=False):
313
+ citation_button = gr.Textbox(
314
+ value=CITATION_BUTTON_TEXT,
315
+ label=CITATION_BUTTON_LABEL,
316
+ elem_id="citation-button",
317
+ show_copy_button=True,
318
+ )
319
+
320
+ # selection for column part:
321
+ checkbox_aad_group = gr.CheckboxGroup(
322
+ choices=TASK_AAD_INFO,
323
+ value=AVG_INFO,
324
+ label="Evaluation Dimension",
325
+ interactive=True,
326
+ ) # user can select the evaluation dimension
327
+
328
+ with gr.Row():
329
+ # selection for model size part:
330
+ model_size = gr.CheckboxGroup(
331
+ choices=MODEL_SIZE,
332
+ value=MODEL_SIZE,
333
+ label="Model Size",
334
+ interactive=True,
335
+ )
336
+
337
+ # selection for model size part:
338
+ question_type = gr.CheckboxGroup(
339
+ choices=QUESTION_TYPE,
340
+ value=QUESTION_TYPE,
341
+ label="Question Type",
342
+ interactive=True,
343
+ )
344
+
345
+ baseline_value = get_baseline_aad_df()
346
+ baseline_header = MODEL_INFO + checkbox_aad_group.value
347
+ baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_aad_group.value)
348
+
349
+ data_component_aad = gr.components.Dataframe(
350
+ value=baseline_value,
351
+ headers=baseline_header,
352
+ type="pandas",
353
+ datatype=baseline_datatype,
354
+ interactive=False,
355
+ visible=True,
356
+ )
357
+
358
+ def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
359
+
360
+ updated_data = get_all_aad_df()
361
+ # model_size & question_type:
362
+
363
+ def custom_filter(row, model_size_filters, question_type_filters):
364
+ model_size = row['Model Size']
365
+ question_type = row['Question Type']
366
+ model_size = model_size.upper()
367
+
368
+ if model_size == '-':
369
+ size_filter = '-' in model_size_filters
370
+ elif 'B' in model_size:
371
+ size = float(model_size.replace('B', ''))
372
+ size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
373
+ else:
374
+ size_filter = False
375
+
376
+ question_type_filter = question_type in question_type_filters
377
+
378
+ return size_filter and question_type_filter
379
+
380
+ mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
381
+ updated_data = updated_data[mask]
382
+
383
+ # columns:
384
+ selected_columns = [item for item in TASK_AAD_INFO if item in selected_columns]
385
+ present_columns = MODEL_INFO + selected_columns
386
+ updated_data = updated_data[present_columns]
387
+ updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
388
+ updated_headers = present_columns
389
+ update_datatype = [DATA_AAD_TITILE_TYPE[COLUMN_AAD_NAMES.index(x)] for x in updated_headers]
390
+
391
+ filter_component = gr.components.Dataframe(
392
+ value=updated_data,
393
+ headers=updated_headers,
394
+ type="pandas",
395
+ datatype=update_datatype,
396
+ interactive=False,
397
+ visible=True,
398
+ )
399
+ return filter_component
400
+
401
+
402
+ model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
403
+ question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
404
+ checkbox_aad_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
405
+
406
+
407
+ with gr.TabItem("🏅 MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
408
+ with gr.Row():
409
+ with gr.Accordion("Citation", open=False):
410
+ citation_button = gr.Textbox(
411
+ value=CITATION_BUTTON_TEXT,
412
+ label=CITATION_BUTTON_LABEL,
413
+ elem_id="citation-button",
414
+ show_copy_button=True,
415
+ )
416
+
417
+ checkbox_iasd_group = gr.CheckboxGroup(
418
+ choices=TASK_IASD_INFO,
419
+ value=AVG_INFO,
420
+ label="Evaluation Dimension",
421
+ interactive=True,
422
+ ) # user can select the evaluation dimension
423
+
424
+ with gr.Row():
425
+ # selection for model size part:
426
+ model_size = gr.CheckboxGroup(
427
+ choices=MODEL_SIZE,
428
+ value=MODEL_SIZE,
429
+ label="Model Size",
430
+ interactive=True,
431
+ )
432
+
433
+ # selection for model size part:
434
+ question_type = gr.CheckboxGroup(
435
+ choices=QUESTION_TYPE,
436
+ value=QUESTION_TYPE,
437
+ label="Question Type",
438
+ interactive=True,
439
+ )
440
+
441
+ baseline_value = get_baseline_iasd_df()
442
+ baseline_header = MODEL_INFO + checkbox_iasd_group.value
443
+ baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_iasd_group.value)
444
+
445
+ data_component_iasd = gr.components.Dataframe(
446
+ value=baseline_value,
447
+ headers=baseline_header,
448
+ type="pandas",
449
+ datatype=baseline_datatype,
450
+ interactive=False,
451
+ visible=True,
452
+ )
453
+
454
+ def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
455
+
456
+ updated_data = get_all_iasd_df()
457
+
458
+ def custom_filter(row, model_size_filters, question_type_filters):
459
+ model_size = row['Model Size']
460
+ question_type = row['Question Type']
461
+ model_size = model_size.upper()
462
+
463
+ if model_size == '-':
464
+ size_filter = '-' in model_size_filters
465
+ elif 'B' in model_size:
466
+ size = float(model_size.replace('B', ''))
467
+ size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
468
+ else:
469
+ size_filter = False
470
+
471
+ question_type_filter = question_type in question_type_filters
472
+
473
+ return size_filter and question_type_filter
474
+
475
+ mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
476
+ updated_data = updated_data[mask]
477
+
478
+ # columns:
479
+ selected_columns = [item for item in TASK_IASD_INFO if item in selected_columns]
480
+ present_columns = MODEL_INFO + selected_columns
481
+ updated_data = updated_data[present_columns]
482
+ updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
483
+ updated_headers = present_columns
484
+ update_datatype = [DATA_IASD_TITILE_TYPE[COLUMN_IASD_NAMES.index(x)] for x in updated_headers]
485
+
486
+ filter_component = gr.components.Dataframe(
487
+ value=updated_data,
488
+ headers=updated_headers,
489
+ type="pandas",
490
+ datatype=update_datatype,
491
+ interactive=False,
492
+ visible=True,
493
+ )
494
+ return filter_component
495
+
496
+ model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
497
+ question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
498
+ checkbox_iasd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
499
+
500
+ # Table 3
501
+ with gr.TabItem("🏅 MM-IVQD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=3):
502
+ with gr.Row():
503
+ with gr.Accordion("Citation", open=False):
504
+ citation_button = gr.Textbox(
505
+ value=CITATION_BUTTON_TEXT,
506
+ label=CITATION_BUTTON_LABEL,
507
+ elem_id="citation-button",
508
+ show_copy_button=True,
509
+ )
510
+
511
+ # selection for column part:
512
+ checkbox_ivqd_group = gr.CheckboxGroup(
513
+ choices=TASK_IVQD_INFO,
514
+ value=AVG_INFO,
515
+ label="Evaluation Dimension",
516
+ interactive=True,
517
+ ) # user can select the evaluation dimension
518
+
519
+ with gr.Row():
520
+ # selection for model size part:
521
+ model_size = gr.CheckboxGroup(
522
+ choices=MODEL_SIZE,
523
+ value=MODEL_SIZE,
524
+ label="Model Size",
525
+ interactive=True,
526
+ )
527
+
528
+ # selection for model size part:
529
+ question_type = gr.CheckboxGroup(
530
+ choices=QUESTION_TYPE,
531
+ value=QUESTION_TYPE,
532
+ label="Question Type",
533
+ interactive=True,
534
+ )
535
+
536
+ baseline_value = get_baseline_ivqd_df()
537
+ baseline_header = MODEL_INFO + checkbox_ivqd_group.value
538
+ baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_ivqd_group.value)
539
+
540
+ data_component_ivqd = gr.components.Dataframe(
541
+ value=baseline_value,
542
+ headers=baseline_header,
543
+ type="pandas",
544
+ datatype=baseline_datatype,
545
+ interactive=False,
546
+ visible=True,
547
+ )
548
+
549
+ def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
550
+
551
+ updated_data = get_all_ivqd_df()
552
+
553
+ def custom_filter(row, model_size_filters, question_type_filters):
554
+ model_size = row['Model Size']
555
+ question_type = row['Question Type']
556
+ model_size = model_size.upper()
557
+
558
+ if model_size == '-':
559
+ size_filter = '-' in model_size_filters
560
+ elif 'B' in model_size:
561
+ size = float(model_size.replace('B', ''))
562
+ size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
563
+ else:
564
+ size_filter = False
565
+
566
+ question_type_filter = question_type in question_type_filters
567
+
568
+ return size_filter and question_type_filter
569
+
570
+ mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
571
+ updated_data = updated_data[mask]
572
+
573
+ selected_columns = [item for item in TASK_IVQD_INFO if item in selected_columns]
574
+ present_columns = MODEL_INFO + selected_columns
575
+ updated_data = updated_data[present_columns]
576
+ updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
577
+ updated_headers = present_columns
578
+ update_datatype = [DATA_IVQD_TITILE_TYPE[COLUMN_IVQD_NAMES.index(x)] for x in updated_headers]
579
+
580
+ filter_component = gr.components.Dataframe(
581
+ value=updated_data,
582
+ headers=updated_headers,
583
+ type="pandas",
584
+ datatype=update_datatype,
585
+ interactive=False,
586
+ visible=True,
587
+ )
588
+ return filter_component
589
+
590
+ model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
591
+ question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
592
+ checkbox_ivqd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
593
+
594
+
595
+ # table 4
596
+ with gr.TabItem("📝 About", elem_id="mmupd-benchmark-tab-table", id=4):
597
+ gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
598
+
599
+ # table 5
600
+ with gr.TabItem("🚀 Submit here! ", elem_id="mmupd-benchmark-tab-table", id=5):
601
+ gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
602
+
603
+ with gr.Row():
604
+ gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
605
+
606
+ with gr.Row():
607
+ gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
608
+
609
+ with gr.Row():
610
+ with gr.Column():
611
+ model_type = gr.Dropdown(
612
+ choices=["VLM", "LLM"],
613
+ label="Model type",
614
+ multiselect=False,
615
+ value="VLM",
616
+ interactive=True,
617
+ )
618
+ model_name_textbox = gr.Textbox(
619
+ label="Model name", placeholder="LLaMA-7B"
620
+ )
621
+ revision_name_textbox = gr.Textbox(
622
+ label="Revision Model Name", placeholder="LLaMA-7B"
623
+ )
624
+
625
+ model_link = gr.Textbox(
626
+ label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
627
+ )
628
+
629
+ model_size = gr.Textbox(
630
+ label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
631
+ )
632
+
633
+ with gr.Column():
634
+ LLM_type = gr.Dropdown(
635
+ choices=["Vicuna-1.5-7B", "Vicuna-1.5-13B", "Flan-T5-XL", "LLaMA-7B", "Llama-13B", "Llama-3-8B", "Llama-3-70B", "Yi-34B", "Mistral-7B", "Other"],
636
+ label="LLM type",
637
+ multiselect=False,
638
+ value="Vicuna-1.5-13B",
639
+ interactive=True,
640
+ )
641
+
642
+ LLM_name_textbox = gr.Textbox(
643
+ label="LLM model (Required for Other)",
644
+ placeholder="GPT-4",
645
+ )
646
+
647
+ upd_type = gr.Dropdown(
648
+ choices=[
649
+ "AAD",
650
+ "IASD",
651
+ "IVQD",
652
+ ],
653
+ label="UPD type",
654
+ multiselect=False,
655
+ value="AAD",
656
+ interactive=True,
657
+ )
658
+
659
+ question_type = gr.Dropdown(
660
+ choices=QUESTION_TYPE,
661
+ label="Question Type",
662
+ multiselect=False,
663
+ value=QUESTION_TYPE[0],
664
+ interactive=True,
665
+ )
666
+
667
+ with gr.Column():
668
+
669
+ input_file = gr.components.File(label="Click to Upload a Dual Evaluation File", file_count="single", type='binary')
670
+ submit_button = gr.Button("Submit Eval")
671
+
672
+ submission_result = gr.Markdown()
673
+ submit_button.click(
674
+ add_new_eval,
675
+ inputs = [
676
+ input_file,
677
+ model_type,
678
+ model_name_textbox,
679
+ revision_name_textbox,
680
+ model_link,
681
+ model_size,
682
+ upd_type,
683
+ LLM_type,
684
+ LLM_name_textbox,
685
+ question_type
686
+ ],
687
+ )
688
+
689
+ def refresh_data():
690
+ value1 = get_baseline_aad_df()
691
+ value2 = get_baseline_iasd_df()
692
+ value3 = get_baseline_ivqd_df()
693
+
694
+ return value1, value2, value3
695
+
696
+ with gr.Row():
697
+ data_run = gr.Button("Refresh")
698
+ data_run.click(
699
+ refresh_data, outputs=[data_component_aad, data_component_iasd, data_component_ivqd]
700
+ )
701
+
702
+ block.launch()
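To make the accuracy columns concrete, below is a toy sketch of the aggregation that `report_acc()` / `eval_result_dual()` above perform on an uploaded `result_dual.xlsx`. The tiny DataFrame and its values are invented for illustration only.

```python
import pandas as pd

# Toy stand-in for result_dual.xlsx (all values are invented).
df = pd.DataFrame({
    "split": ["test"] * 4,
    "category": ["ocr", "ocr", "image_scene", "image_scene"],
    "hit_standard": [1, 1, 0, 1],  # standard question answered correctly
    "hit_upd":      [1, 0, 0, 1],  # UPD (unsolvable) question handled correctly
    "hit":          [1, 0, 0, 1],  # dual result: counted correct only when both above are correct
})

# Overall Dual Acc., as in report_acc(df, None, "dual")
print(round(df["hit"].mean() * 100, 1))                       # 50.0

# Per-category Dual Acc., as in report_acc(df, "category", "dual")
print((df.groupby("category")["hit"].mean() * 100).round(1))  # image_scene 50.0, ocr 50.0
```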
constants.py ADDED
@@ -0,0 +1,90 @@
1
+ # this .py file stores constants
2
+ MODEL_INFO = ["Model Type", "Model", "Language Model", "Question Type"]
3
+ MODEL_SIZE = ["<10B", ">=10B", "-"]
4
+ QUESTION_TYPE = ["Base", "Option", "Instruction"]
5
+ LEADERBOARD_VERSION = ["Version1"]
6
+ TASK_AAD_INFO = ["Overall Dual Acc.", "Overall Standard Acc.", "Overall UPD Acc.", "action_recognition", "attribute_comparison", "attribute_recognition", "celebrity_recognition", "function_reasoning", "future_prediction", "identity_reasoning", "image_emotion", "image_scene", "image_style", "image_topic", "nature_relation", "object_localization", "ocr", "physical_property_reasoning", "physical_relation", "social_relation", "structuralized_imagetext_understanding"]
7
+ TASK_IASD_INFO = ["Overall Dual Acc.", "Overall Standard Acc.", "Overall UPD Acc.", "action_recognition", "attribute_comparison", "attribute_recognition", "celebrity_recognition", "function_reasoning", "future_prediction", "identity_reasoning", "image_emotion", "image_scene", "image_style", "image_topic", "nature_relation", "object_localization", "ocr", "physical_property_reasoning", "physical_relation", "social_relation", "structuralized_imagetext_understanding"]
8
+ TASK_IVQD_INFO = ["Overall Dual Acc.", "Overall Standard Acc.", "Overall UPD Acc.", "action_recognition", "attribute_comparison", "attribute_recognition", "celebrity_recognition", "function_reasoning", "image_scene", "nature_relation", "object_localization", "ocr", "physical_property_reasoning", "physical_relation", "social_relation"]
9
+
10
+ AVG_INFO = ["Overall Dual Acc."]
11
+
12
+ DATA_AAD_TITILE_TYPE = ["markdown", "markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
13
+ DATA_IASD_TITILE_TYPE = ["markdown", "markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
14
+ DATA_IVQD_TITILE_TYPE = ["markdown", "markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
15
+
16
+ CSV_AAD_RESULT_PATH = "./download_from_dataset/results/result_aad.csv"
17
+ CSV_IASD_RESULT_PATH = "./download_from_dataset/results/result_iasd.csv"
18
+ CSV_IVQD_RESULT_PATH = "./download_from_dataset/results/result_ivqd.csv"
19
+
20
+ CSV_QUEUE_DIR = "./download_from_dataset/queue/"
21
+
22
+ COLUMN_AAD_NAMES = MODEL_INFO + TASK_AAD_INFO
23
+ COLUMN_IASD_NAMES = MODEL_INFO + TASK_IASD_INFO
24
+ COLUMN_IVQD_NAMES = MODEL_INFO + TASK_IVQD_INFO
25
+
26
+ LEADERBORAD_VERSION = ["MM-AAD", "MM-IASD", "MM-IVQD"]
27
+
28
+
29
+ LEADERBORAD_INTRODUCTION = """
30
+ # UPD Leaderboard
31
+
32
+ *"Which VLM is reliable?"*
33
+ 🏆 Welcome to the leaderboard of the **UPD**! *Unsolvable Problem Detection: Evaluating Trustworthiness of Vision Language Models* (**arXiv 2024**) [![Code](https://img.shields.io/github/stars/AtsuMiyai/UPD.svg?style=social&label=Official)](https://github.com/AtsuMiyai/UPD)
34
+ <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
35
+ <a href='https://arxiv.org/abs/2403.20331'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
36
+ </div>
37
+
38
+ - **Multiple Scenario Evaluation:** We carefully design prompt choices and examine three scenarios: (i) base (no instruction), (ii) option (add an additional option), (iii) instruction (add an instruction).
39
+ - **Ability-wise Evaluation:** We carefully decompose each benchmark into more than 10 abilities to reveal each individual model's strengths and weaknesses.
40
+ - **Valuable Insights:** MM-UPD Bench provides multi-perspective insights on trustworthiness and reliability for the community.
41
+
42
+ Please follow the instructions in [UPD](https://github.com/AtsuMiyai/UPD) to upload the generated `result_dual.xlsx` file here. After clicking the `Submit Eval` button, click the `Refresh` button.
43
+ """
44
+
45
+
46
+ SUBMIT_INTRODUCTION = """# Submit on MM-UPD Benchmark Introduction
47
+ 1. Obtain Dual Result Excel File from our [github repository](https://github.com/AtsuMiyai/UPD/tree/main/scripts/inference).
48
+ 2. If you want to update model performance by uploading new results, please ensure 'Revision Model Name' is the same as what's shown in the leaderboard. For example, if you want to modify LLaVA-1.5-13B's performance, you need to fill in 'LLaVA-1.5-13B' in 'Revision Model Name'.
49
+ 3. Please provide the correct link of your model's repository for each submission.
50
+ 4. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
51
+
52
+ Note: The example of the submitted excel file is this url: [llava1.5_13b_result_dual.xlsx](https://docs.google.com/spreadsheets/d/1Se0_iYHr6aktHFnCzwArU1ExTjL-UmeO/edit?usp=sharing&ouid=103623120947968158097&rtpof=true&sd=true).
53
+ Please make sure that (i) the excel file has predictions for all data, and (ii) the hit_upd, hit_standard, and hit columns exist.
54
+
55
+ ## Submit Example
56
+ If you want to upload LLaVA-1.5-13B's result in the leaderboard, you need to:
57
+ 1. Select VLM in 'Model Type'.
58
+ 2. Fill in 'LLaVA-1.5-13B' in 'Model Name' if this is your first time submitting your result (you can leave 'Revision Model Name' blank).
59
+ 3. Fill in 'LLaVA-1.5-13B' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
60
+ 4. Fill in 'https://github.com/haotian-liu/LLaVA' in 'Model Link'.
61
+ 5. Fill in '13B' in 'Model size'.
62
+ 6. Select 'Vicuna-1.5-13B' in 'LLM Type'.
63
+ 7. Fill in 'LLM model' if you select Others for 'LLM Type'.
64
+ 8. Select 'AAD', 'IASD', or 'IVQD' in 'UPD type'.
65
+ 9. Select 'Base', 'Option', or 'Instruction' in 'Question Type'.
66
+ 10. Upload your `result_dual.xlsx` file.
67
+ 11. Click the 'Submit Eval' button.
68
+ 12. Click 'Refresh' to obtain the uploaded leaderboard.
69
+
70
+ ### If you have any questions or deletion requests, please contact [miyai@cvm.t.u-tokyo.ac.jp](mailto:miyai@cvm.t.u-tokyo.ac.jp).
71
+ ### ⚠️ Please do not submit any malicious content.
72
+ """
73
+
74
+
75
+
76
+ LEADERBORAD_INFO = """
77
+ MM-UPD Bench is a comprehensive benchmark for evaluating the trustworthiness of Vision Language Models
78
+ (VLMs) in the context of Unsolvable Problem Detection (UPD). MM-UPD encompasses three benchmarks:
79
+ MM-AAD, MM-IASD, and MM-IVQD. Each benchmark covers a wide range of abilities. Through these benchmarks,
80
+ we aim to provide a comprehensive evaluation of VLMs across multiple scenarios.
81
+ """
82
+
83
+
84
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
85
+ CITATION_BUTTON_TEXT = r"""@article{miyai2024unsolvable,
86
+ title={{Unsolvable Problem Detection}: Evaluating Trustworthiness of Vision Language Models},
87
+ author={Miyai, Atsuyuki and Yang, Jingkang and Zhang, Jingyang and Ming, Yifei and Yu, Qing and Irie, Go and Li, Yixuan and Li, Hai and Liu, Ziwei and Aizawa, Kiyoharu},
88
+ journal={arXiv preprint arXiv:2403.20331},
89
+ year={2024}
90
+ }"""
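The submission checks described in SUBMIT_INTRODUCTION above are enforced by `validity_check()` in app.py; a minimal local sanity check along the same lines, assuming your file is named `result_dual.xlsx` and you are submitting to the AAD track (both are placeholders), could look like this:

```python
import pandas as pd

# Minimal local sanity check mirroring validity_check() and eval_result_dual() in app.py.
UPD_TYPE = "AAD"  # or "IASD" / "IVQD"
EXPECTED_ROWS = {"AAD": 820, "IASD": 919, "IVQD": 356}
REQUIRED_COLUMNS = ["hit_upd", "hit_standard", "hit", "prediction_upd", "prediction_standard"]

df = pd.read_excel("result_dual.xlsx")  # needs openpyxl (listed in requirements.txt)

assert len(df) == EXPECTED_ROWS[UPD_TYPE], "Different data size"
assert all(col in df.columns for col in REQUIRED_COLUMNS), "Column missing"
assert not df[REQUIRED_COLUMNS].isnull().any().any(), "Missing values found in columns"

# Overall accuracies as reported on the leaderboard
for col, label in [("hit", "Dual"), ("hit_standard", "Standard"), ("hit_upd", "UPD")]:
    print(f"Overall {label} Acc.: {round(df[col].mean() * 100, 1)}")
```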
pyproject.toml ADDED
@@ -0,0 +1,13 @@
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ APScheduler==3.10.1
2
+ black==23.11.0
3
+ click==8.1.3
4
+ datasets==2.14.5
5
+ gradio==4.4.0
6
+ gradio_client==0.7.0
7
+ huggingface-hub>=0.23.2
8
+ matplotlib==3.7.1
9
+ numpy==1.24.2
10
+ pandas==2.0.0
11
+ python-dateutil==2.8.2
12
+ requests==2.28.2
13
+ tqdm==4.65.0
14
+ transformers==4.35.2
15
+ tokenizers>=0.15.0
16
+ git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
+ accelerate==0.24.1
18
+ sentencepiece
19
+ openpyxl
src/utils_display.py ADDED
@@ -0,0 +1,99 @@
1
+ from dataclasses import dataclass
2
+
3
+ # These classes are for user facing column names, to avoid having to change them
4
+ # all around the code when a modification is needed
5
+ @dataclass
6
+ class ColumnContent:
7
+ name: str
8
+ type: str
9
+ displayed_by_default: bool
10
+ hidden: bool = False
11
+
12
+ def fields(raw_class):
13
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
14
+
15
+ @dataclass(frozen=True)
16
+ class AutoEvalColumn: # Auto evals column
17
+ model_type_symbol = ColumnContent("T", "str", True)
18
+ model = ColumnContent("Model", "markdown", True)
19
+ average = ColumnContent("Average ⬆️", "number", True)
20
+ arc = ColumnContent("ARC", "number", True)
21
+ hellaswag = ColumnContent("HellaSwag", "number", True)
22
+ mmlu = ColumnContent("MMLU", "number", True)
23
+ truthfulqa = ColumnContent("TruthfulQA", "number", True)
24
+ model_type = ColumnContent("Type", "str", False)
25
+ precision = ColumnContent("Precision", "str", False, True)
26
+ license = ColumnContent("Hub License", "str", False)
27
+ params = ColumnContent("#Params (B)", "number", False)
28
+ likes = ColumnContent("Hub ❤️", "number", False)
29
+ revision = ColumnContent("Model sha", "str", False, False)
30
+ dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
31
+
32
+ @dataclass(frozen=True)
33
+ class EloEvalColumn: # Elo evals column
34
+ model = ColumnContent("Model", "markdown", True)
35
+ gpt4 = ColumnContent("GPT-4 (all)", "number", True)
36
+ human_all = ColumnContent("Human (all)", "number", True)
37
+ human_instruct = ColumnContent("Human (instruct)", "number", True)
38
+ human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
39
+
40
+
41
+ @dataclass(frozen=True)
42
+ class EvalQueueColumn: # Queue column
43
+ model = ColumnContent("model", "markdown", True)
44
+ revision = ColumnContent("revision", "str", True)
45
+ private = ColumnContent("private", "bool", True)
46
+ precision = ColumnContent("precision", "bool", True)
47
+ weight_type = ColumnContent("weight_type", "str", "Original")
48
+ status = ColumnContent("status", "str", True)
49
+
50
+ LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
51
+
52
+
53
+ KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
54
+ VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
55
+ OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
56
+ DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
57
+ MODEL_PAGE = "https://huggingface.co/models"
58
+ LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
59
+ VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
60
+ ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
61
+
62
+
63
+ def model_hyperlink(link, model_name):
64
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
65
+
66
+
67
+ def make_clickable_model(model_name):
68
+ link = f"https://huggingface.co/{model_name}"
69
+
70
+ if model_name in LLAMAS:
71
+ link = LLAMA_LINK
72
+ model_name = model_name.split("/")[1]
73
+ elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
74
+ link = VICUNA_LINK
75
+ model_name = "stable-vicuna-13b"
76
+ elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
77
+ link = ALPACA_LINK
78
+ model_name = "alpaca-13b"
79
+ if model_name == "dolly-12b":
80
+ link = DOLLY_LINK
81
+ elif model_name == "vicuna-13b":
82
+ link = VICUNA_LINK
83
+ elif model_name == "koala-13b":
84
+ link = KOALA_LINK
85
+ elif model_name == "oasst-12b":
86
+ link = OASST_LINK
87
+ #else:
88
+ # link = MODEL_PAGE
89
+
90
+ return model_hyperlink(link, model_name)
91
+
92
+ def styled_error(error):
93
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
94
+
95
+ def styled_warning(warn):
96
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
97
+
98
+ def styled_message(message):
99
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"