File size: 7,082 Bytes
3e8020b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62d48db
3e8020b
 
 
 
62d48db
3e8020b
 
 
 
62d48db
 
3e8020b
 
 
 
 
 
 
 
 
 
 
 
62d48db
3e8020b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Constants for the leaderboard: column labels, selectable options and the
# per-benchmark task lists.

# Metadata column labels. They are prepended to each TASK_*_INFO list to
# build the COLUMN_*_NAMES lists defined in this file.
MODEL_INFO = [
    "Model Type",
    "Model",
    "Language Model",
    "Question Type",
]
MODEL_SIZE = [
    "<10B",
    ">=10B",
    "-",
]
QUESTION_TYPE = [
    "Base",
    "Option",
    "Instruction",
]
LEADERBOARD_VERSION = [
    "Version1",
]

# Score column labels for MM-AAD: three overall accuracies followed by the
# per-ability entries.
TASK_AAD_INFO = [
    "Overall Dual Acc.",
    "Overall Standard Acc.",
    "Overall UPD Acc.",
    "action_recognition",
    "attribute_comparison",
    "attribute_recognition",
    "celebrity_recognition",
    "function_reasoning",
    "future_prediction",
    "identity_reasoning",
    "image_emotion",
    "image_scene",
    "image_style",
    "image_topic",
    "nature_relation",
    "object_localization",
    "ocr",
    "physical_property_reasoning",
    "physical_relation",
    "social_relation",
    "structuralized_imagetext_understanding",
]

# MM-IASD uses exactly the same labels as MM-AAD; an independent copy is made
# so the two lists stay separate objects.
TASK_IASD_INFO = list(TASK_AAD_INFO)

# MM-IVQD lists fewer per-ability entries than the other two benchmarks.
TASK_IVQD_INFO = [
    "Overall Dual Acc.",
    "Overall Standard Acc.",
    "Overall UPD Acc.",
    "action_recognition",
    "attribute_comparison",
    "attribute_recognition",
    "celebrity_recognition",
    "function_reasoning",
    "image_scene",
    "nature_relation",
    "object_localization",
    "ocr",
    "physical_property_reasoning",
    "physical_relation",
    "social_relation",
]

AVG_INFO = [
    "Overall Dual Acc.",
]

# Type labels, one per table column: four "markdown" entries followed by one
# "number" entry per score column. The lengths (25, 25, 19) line up with the
# COLUMN_*_NAMES lists in this file (4 metadata labels + 21 / 21 / 15 scores).
# NOTE(review): presumably consumed as the column datatypes of the leaderboard
# tables -- confirm against the caller.
DATA_AAD_TITILE_TYPE = ["markdown"] * 4 + ["number"] * 21
DATA_IASD_TITILE_TYPE = ["markdown"] * 4 + ["number"] * 21
DATA_IVQD_TITILE_TYPE = ["markdown"] * 4 + ["number"] * 15

# All locations below sit under one folder, given relative to the current
# working directory.
_DOWNLOAD_DIR = "./download_from_dataset"

# One results CSV per benchmark.
CSV_AAD_RESULT_PATH = _DOWNLOAD_DIR + "/results/result_aad.csv"
CSV_IASD_RESULT_PATH = _DOWNLOAD_DIR + "/results/result_iasd.csv"
CSV_IVQD_RESULT_PATH = _DOWNLOAD_DIR + "/results/result_ivqd.csv"

# Queue directory; the trailing slash is part of the value.
CSV_QUEUE_DIR = _DOWNLOAD_DIR + "/queue/"

# Full column-label list for each benchmark table: the shared metadata labels
# first, then that benchmark's score labels. Each result is a new list.
COLUMN_AAD_NAMES = [*MODEL_INFO, *TASK_AAD_INFO]
COLUMN_IASD_NAMES = [*MODEL_INFO, *TASK_IASD_INFO]
COLUMN_IVQD_NAMES = [*MODEL_INFO, *TASK_IVQD_INFO]

# Names of the three MM-UPD benchmarks.
# NOTE(review): the identifier is spelled "LEADERBORAD" and is a different
# object from LEADERBOARD_VERSION defined earlier in this file. The spelling is
# kept as-is because other modules may import it under this name.
LEADERBORAD_VERSION = [
    "MM-AAD",
    "MM-IASD",
    "MM-IVQD",
]


# Markdown/HTML banner text for the top of the leaderboard page: title, paper
# and code badges, three highlight bullets and a short upload instruction.
# Spelling in the user-facing text is corrected ("Scenario", "scenarios",
# "reliability", "prompt choices"); links and layout are unchanged.
LEADERBORAD_INTRODUCTION = """
# UPD Leaderboard

*"Which VLM is reliable?"*
🏆 Welcome to the leaderboard of the **UPD**! *Unsolvable Problem Detection: Evaluating Trustworthiness of Vision Language Models* (**arXiv 2024**)   [![Code](https://img.shields.io/github/stars/AtsuMiyai/UPD.svg?style=social&label=Official)](https://github.com/AtsuMiyai/UPD) 
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<a href='https://arxiv.org/abs/2403.20331'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
</div>

- **Multiple Scenario Evaluation:** We carefully design prompt choices and examine the three scenarios: (i) base (no instruction), (ii) option (add an additional option), (iii) instruction (add an instruction).
- **Ability-wise Evaluation:** We carefully decompose each benchmark into more than 10 abilities to reveal individual model's strengths and weaknesses.
- **Valuable Insights:** MM-UPD Bench provides multi-perspective insights on trustworthiness and reliability for the community.  

Please follow the instructions in [UPD](https://github.com/AtsuMiyai/UPD) to upload the generated `result_dual.json` file here. After clicking the `Submit Eval` button, click the `Refresh` button.
"""


# Markdown help text for the submission form.
# Two fixes in the user-facing text:
#   * the contact link now carries a "mailto:" scheme; without it Markdown
#     treats the address as a relative URL and the link does not open a mail
#     client;
#   * step 2 now calls the field 'Revision Model Name', matching every other
#     mention in this text.
# NOTE(review): every line after the first is indented four spaces, which
# plain Markdown would read as a code block; presumably the consumer dedents
# the text before rendering -- confirm before changing the indentation.
SUBMIT_INTRODUCTION = """# Submit on MM-UPD Benchmark Introduction
    1. Obtain Dual Result JSON File from our [github repository](https://github.com/AtsuMiyai/UPD/tree/main/scripts/inference).
    2. If you want to update model performance by uploading new results, please ensure 'Revision Model Name' is the same as what's shown in the leaderboard. For example, if you want to modify LLaVA-1.5-13B's performance, you need to fill in 'LLaVA-1.5-13B' in 'Revision Model Name'.
    3. Please provide the correct link of your model's repository for each submission.  
    4. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.

    Note: The example of the submitted JSON file is this url: [llava1.5_13b_result_dual_detail_submission.json](https://drive.google.com/file/d/1ILYlxcKC_a5Jrm7kyyqeHo0vo3WjkA1V/view?usp=sharing).    
          You need to care about whether (i) the JSON file has the prediction for all data, (ii) the data on all options, "hit_upd", "hit_standard", and "hit" exist.

    ## Submit Example
    If you want to upload LLaVA-1.5-13B's result in the leaderboard, you need to:
    1. Select VLM in 'Model Type'.
    2. Fill in 'LLaVA-1.5-13B' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
    3. Fill in 'LLaVA-1.5-13B' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
    4. Fill in 'https://github.com/haotian-liu/LLaVA' in 'Model Link'.
    5. Fill in '13B' in 'Model size'.
    6. Select 'Vicuna-1.5-13B' in 'LLM Type'.
    7. Fill in 'LLM model' if you select Others for 'LLM Type'.
    8. Select 'AAD', 'IASD', or 'IVQD' in 'UPD_Type'.
    9. Select 'Base', 'Option', or 'Instruction' in 'Question Type'.
    10. Upload results.json.
    11. Click the 'Submit Eval' button.
    12. Click 'Refresh' to obtain the uploaded leaderboard.

    ### If you have any questions or deletion requests, please contact [miyai@cvm.t.u-tokyo.ac.jp](mailto:miyai@cvm.t.u-tokyo.ac.jp).
    ### ⚠️ Please do not submit any malicious content.
"""



# Short description of the MM-UPD benchmark suite.
# Fixes user-facing grammar and spelling: "Each benchmark covers" (was
# "cover") and "scenarios" (was "senarios"). Line layout and indentation of
# the text are unchanged.
LEADERBORAD_INFO = """
      MM-UPD Bench is a comprehensive benchmark for evaluating the trustworthiness of Vision Language Models 
      (VLMs) in the context of Unsolvable Problem Detection (UPD). MM-UPD encompasses three benchmarks: 
      MM-AAD, MM-IASD, and MM-IVQD. Each benchmark covers a wide range of abilities. Through these benchmarks, 
      we aim to provide a comprehensive evaluation of VLMs across multiple scenarios.
"""


# Label shown next to the citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX entry for the UPD paper, assembled one source line per list item.
# Lines are joined with "\n" and there is no trailing newline.
CITATION_BUTTON_TEXT = "\n".join(
    [
        r"@article{miyai2024unsolvable,",
        r"  title={{Unsolvable Problem Detection}: Evaluating Trustworthiness of Vision Language Models},",
        r"  author={Miyai, Atsuyuki and Yang, Jingkang and Zhang, Jingyang and Ming, Yifei and Yu, Qing and Irie, Go and Li, Yixuan and Li, Hai and Liu, Ziwei and Aizawa, Kiyoharu},",
        r"  journal={arXiv preprint arXiv:2403.20331},",
        r"  year={2024}",
        r"}",
    ]
)