# This .py file stores constants for the MM-UPD leaderboard.
MODEL_INFO = ["Model Type", "Model", "Language Model", "Question Type"]
MODEL_SIZE = ["<10B", ">=10B", "-"]
QUESTION_TYPE = ["Base", "Option", "Instruction"]
LEADERBOARD_VERSION = ["Version1"]
TASK_AAD_INFO = ["Overall Dual Acc.", "Overall Standard Acc.", "Overall UPD Acc.", "action_recognition", "attribute_comparison", "attribute_recognition", "celebrity_recognition", "function_reasoning", "future_prediction", "identity_reasoning", "image_emotion", "image_scene", "image_style", "image_topic", "nature_relation", "object_localization", "ocr", "physical_property_reasoning", "physical_relation", "social_relation", "structuralized_imagetext_understanding"]
TASK_IASD_INFO = ["Overall Dual Acc.", "Overall Standard Acc.", "Overall UPD Acc.", "action_recognition", "attribute_comparison", "attribute_recognition", "celebrity_recognition", "function_reasoning", "future_prediction", "identity_reasoning", "image_emotion", "image_scene", "image_style", "image_topic", "nature_relation", "object_localization", "ocr", "physical_property_reasoning", "physical_relation", "social_relation", "structuralized_imagetext_understanding"]
TASK_IVQD_INFO = ["Overall Dual Acc.", "Overall Standard Acc.", "Overall UPD Acc.", "action_recognition", "attribute_comparison", "attribute_recognition", "celebrity_recognition", "function_reasoning", "image_scene", "nature_relation", "object_localization", "ocr", "physical_property_reasoning", "physical_relation", "social_relation"]

AVG_INFO = ["Overall Dual Acc."]

DATA_AAD_TITILE_TYPE = ["markdown"] * len(MODEL_INFO) + ["number"] * len(TASK_AAD_INFO)
DATA_IASD_TITILE_TYPE = ["markdown"] * len(MODEL_INFO) + ["number"] * len(TASK_IASD_INFO)
DATA_IVQD_TITILE_TYPE = ["markdown"] * len(MODEL_INFO) + ["number"] * len(TASK_IVQD_INFO)

CSV_AAD_RESULT_PATH = "./download_from_dataset/results/result_aad.csv"
CSV_IASD_RESULT_PATH = "./download_from_dataset/results/result_iasd.csv"
CSV_IVQD_RESULT_PATH = "./download_from_dataset/results/result_ivqd.csv"

CSV_QUEUE_DIR = "./download_from_dataset/queue/"

COLUMN_AAD_NAMES = MODEL_INFO + TASK_AAD_INFO
COLUMN_IASD_NAMES = MODEL_INFO + TASK_IASD_INFO
COLUMN_IVQD_NAMES = MODEL_INFO + TASK_IVQD_INFO
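
# Illustrative sketch only (not referenced by the constants above): a loader along
# these lines could read the result CSVs into tables with the expected leaderboard
# columns, e.g. load_result_table(CSV_AAD_RESULT_PATH, COLUMN_AAD_NAMES). The function
# name and the use of pandas are assumptions about the surrounding app code, not
# something defined by this file.
def load_result_table(csv_path, column_names):
    """Read a result CSV and keep only the expected leaderboard columns (sketch)."""
    import pandas as pd  # imported locally so the sketch adds no import-time dependency

    df = pd.read_csv(csv_path)
    # Reindex so that any missing column shows up empty instead of raising a KeyError.
    return df.reindex(columns=column_names)
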

LEADERBORAD_VERSION = ["MM-AAD", "MM-IASD", "MM-IVQD"]


LEADERBORAD_INTRODUCTION = """
# UPD Leaderboard

### *"Which VLM is reliable?"* 🏆 Welcome to the leaderboard of the **UPD**! *Unsolvable Problem Detection: Evaluating Trustworthiness of Vision Language Models* (**arXiv 2024**)   [![Code](https://img.shields.io/github/stars/AtsuMiyai/UPD.svg?style=social&label=Official)](https://github.com/AtsuMiyai/UPD) 
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<a href='https://arxiv.org/abs/2403.20331'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
</div>

MM-UPD Bench: A Comprehensive Benchmark for Evaluating the Trustworthiness of Vision Language Models (VLMs) in the Context of Unsolvable Problem Detection (UPD)

Our MM-UPD Bench encompasses three benchmarks: MM-AAD, MM-IASD, and MM-IVQD.

Through these benchmarks, we aim to provide a comprehensive evaluation of VLMs across multiple scenarios.

For more detailed information, please refer to the `About` section.


Please follow the instructions in [UPD](https://github.com/AtsuMiyai/UPD) and upload the generated JSON file here. After clicking the `Submit Eval` button, click the `Refresh` button.
We also welcome PRs to [UPD](https://github.com/AtsuMiyai/UPD) that add your VLMs.
"""


SUBMIT_INTRODUCTION = """# How to Submit to the MM-UPD Benchmark
    1. Obtain the dual result JSON file from our [github repository](https://github.com/AtsuMiyai/UPD/tree/main/scripts/inference).
    2. If you want to update a model's performance by uploading new results, please ensure that 'Revision Model Name' matches the model name shown in the leaderboard. For example, to modify LLaVA-1.5-13B's performance, fill in 'LLaVA-1.5-13B' in 'Revision Model Name'.
    3. Please provide the correct link to your model's repository for each submission.
    4. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest results in the leaderboard.

    Note: An example of the submitted JSON file is available here: [llava1.5_13b_result_dual_detail_submission.json](https://drive.google.com/file/d/1ILYlxcKC_a5Jrm7kyyqeHo0vo3WjkA1V/view?usp=sharing).
          Please make sure that (i) the JSON file contains predictions for all data, and (ii) every entry includes the data on all options together with "hit_upd", "hit_standard", and "hit".
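
    As a quick pre-submission check, a minimal sketch like the following can be used (it assumes the JSON is an object mapping question IDs to per-question records; adapt it to your exact file layout):

    ```python
    import json

    # Hypothetical sanity check: confirm every record carries the required fields.
    with open("llava1.5_13b_result_dual_detail_submission.json") as f:
        results = json.load(f)

    required_keys = {"hit_upd", "hit_standard", "hit"}
    for question_id, record in results.items():
        missing = required_keys - set(record)
        if missing:
            print(question_id, "is missing fields:", sorted(missing))
    ```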

    ## Submit Example
    If you want to upload LLaVA-1.5-13B's result in the leaderboard, you need to:
    1. Select VLM in 'Model Type'.
    2. Fill in 'LLaVA-1.5-13B' in 'Model Name' if this is your first submission (you can leave 'Revision Model Name' blank).
    3. Fill in 'LLaVA-1.5-13B' in 'Revision Model Name' if you want to update your result (you can leave 'Model Name' blank).
    4. Fill in 'https://github.com/haotian-liu/LLaVA' in 'Model Link'.
    5. Fill in '13B' in 'Model size'.
    6. Select 'Vicuna-1.5-13B' in 'LLM Type'.
    7. Fill in 'LLM model' only if you selected 'Others' for 'LLM Type'.
    8. Select 'AAD', 'IASD', or 'IVQD' in 'UPD_Type'.
    9. Select 'Base', 'Option', or 'Instruction' in 'Question Type'.
    10. Upload results.json.
    11. Click the 'Submit Eval' button.
    12. Click 'Refresh' to obtain the uploaded leaderboard.

    ### If you have any questions or deletion requests, please contact [miyai@cvm.t.u-tokyo.ac.jp](mailto:miyai@cvm.t.u-tokyo.ac.jp).
    ### ⚠️ Please do not submit any malicious files (e.g., files you have manually edited).
"""


LEADERBORAD_INFO = """
## What is MM-UPD Bench?
MM-UPD Bench: A Comprehensive Benchmark for Evaluating the Trustworthiness of Vision Language Models (VLMs) in the Context of Unsolvable Problem Detection (UPD)

Our MM-UPD Bench encompasses three benchmarks: MM-AAD, MM-IASD, and MM-IVQD.

1\. **MM-AAD:** Benchmark for Absent Answer Detection (AAD).  
MM-AAD Bench is a dataset where the correct answer option for each question is removed.  
MM-AAD tests the model's capability to recognize when the correct answer is absent from the provided choices.

2\. **MM-IASD:** Benchmark for Incompatible Answer Set Detection (IASD).  
MM-IASD Bench is a dataset where the answer set is completely incompatible with the context specified by the question and the image.  
MM-IASD tests the model's capability to recognize when the answer set is incompatible with the context.

3\. **MM-IVQD:** Benchmark for Incompatible Visual Question Detection (IVQD).  
MM-IVQD Bench is a dataset where the question is incompatible with the image.  
MM-IVQD evaluates the VLMs' capability to discern when a question and image are irrelevant or inappropriate.

We carefully decompose each benchmark into various abilities to reveal individual models' strengths and weaknesses.


## Evaluation Scenario
We evaluate the performance of VLMs on MM-UPD Bench using the following settings:
1. **Base:** In the Base setting, we do not provide any instruction to withhold answers.

2. **Option:** In the Option setting, we provide an additional option (e.g., None of the above) to withhold answers.

3. **Instruction:** In the Instruction setting, we provide an additional instruction (e.g., If all the options are incorrect, answer F. None of the above.) to withhold answers.
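
A rough sketch of how the three settings differ in what is presented to the model (the question, options, and prompt formatting below are assumed for illustration and may differ from the benchmark's actual prompts):

```python
# Toy example question with assumed options.
question = "What is shown in the image?"
options = ["A. dog", "B. cat", "C. bird", "D. fish"]

# Base: the question and the original options only.
base_prompt = question + "\n" + "\n".join(options)

# Option: an extra withholding option is appended to the choices.
option_prompt = question + "\n" + "\n".join(options + ["E. None of the above"])

# Instruction: the withholding instruction (wording from the setting above) is appended.
instruction_prompt = base_prompt + "\nIf all the options are incorrect, answer F. None of the above."
```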



## Evaluation Metrics
We evaluate the performance of VLMs on MM-UPD Bench using the following metrics:
1. **Dual accuracy:** The accuracy on standard-UPD pairs, where we count
success only if the model is correct on both the standard and UPD questions.

2. **Standard accuracy:** The accuracy on standard questions.

3. **UPD (AAD/IASD/IVQD) accuracy:** The accuracy on AAD/IASD/IVQD questions.
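
As a toy illustration of how these metrics relate (the per-pair correctness flags below are assumed example values, not benchmark data):

```python
# 1 = correct, 0 = incorrect; one entry per standard-UPD pair.
hit_standard = [1, 1, 0, 1]  # correctness on the standard questions
hit_upd = [1, 0, 0, 1]       # correctness on the corresponding UPD questions

standard_acc = sum(hit_standard) / len(hit_standard)  # 0.75
upd_acc = sum(hit_upd) / len(hit_upd)                 # 0.50
# Dual accuracy counts a pair as a success only when both questions are correct.
dual_acc = sum(s and u for s, u in zip(hit_standard, hit_upd)) / len(hit_standard)  # 0.50
```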

"""


CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{miyai2024unsolvable,
  title={{Unsolvable Problem Detection}: Evaluating Trustworthiness of Vision Language Models},
  author={Miyai, Atsuyuki and Yang, Jingkang and Zhang, Jingyang and Ming, Yifei and Yu, Qing and Irie, Go and Li, Yixuan and Li, Hai and Liu, Ziwei and Aizawa, Kiyoharu},
  journal={arXiv preprint arXiv:2403.20331},
  year={2024}
}"""