AtsuMiyai committed on
Commit
3e8020b
·
0 Parent(s):

initial commit

Files changed (10)
  1. .gitattributes +55 -0
  2. .gitignore +13 -0
  3. .pre-commit-config.yaml +53 -0
  4. Makefile +13 -0
  5. README.md +45 -0
  6. app.py +702 -0
  7. constants.py +90 -0
  8. pyproject.toml +13 -0
  9. requirements.txt +19 -0
  10. src/utils_display.py +99 -0
.gitattributes ADDED
@@ -0,0 +1,55 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
1
+ .PHONY: style quality
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md ADDED
@@ -0,0 +1,45 @@
1
+ ---
2
+ title: Demo Leaderboard
3
+ emoji: 🥇
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.4.0
8
+ app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # Start the configuration
14
+
15
+ Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
16
+
17
+ Results files should have the following format and be stored as json files:
18
+ ```json
19
+ {
20
+ "config": {
21
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
22
+ "model_name": "path of the model on the hub: org/model",
23
+ "model_sha": "revision on the hub",
24
+ },
25
+ "results": {
26
+ "task_name": {
27
+ "metric_name": score,
28
+ },
29
+ "task_name2": {
30
+ "metric_name": score,
31
+ }
32
+ }
33
+ }
34
+ ```
35
+
36
+ Request files are created automatically by this tool.
37
+
38
+ If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
39
+
40
+ # Code logic for more complex edits
41
+
42
+ You'll find
43
+ - the main table's column names and properties in `src/display/utils.py`
44
+ - the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
45
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
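As a quick illustration of the results-file format documented in README.md above, here is a minimal sketch of how such a JSON file could be loaded and flattened into one leaderboard row. The file name and the task/metric keys are hypothetical placeholders, not part of this repository.

```python
import json

# Minimal sketch (not part of this repo): read one results file in the
# documented format and flatten it into a single leaderboard row.
with open("results_org_model.json") as f:  # placeholder file name
    data = json.load(f)

row = {
    "model": data["config"]["model_name"],
    "revision": data["config"]["model_sha"],
    "dtype": data["config"]["model_dtype"],
}
for task_name, metrics in data["results"].items():
    for metric_name, score in metrics.items():
        row[f"{task_name} ({metric_name})"] = score

print(row)
```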
app.py ADDED
@@ -0,0 +1,702 @@
1
+ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import re
7
+ import numpy as np
8
+ from collections import defaultdict
9
+ from constants import *
10
+ import os
11
+ from huggingface_hub import Repository
12
+
13
+
14
+ global data_component_aad, data_component_iasd, data_component_ivqd, filter_component
15
+
16
+
17
+ TOKEN = os.environ.get("TOKEN")
18
+
19
+ repo = Repository(local_dir="./download_from_dataset", clone_from="MM-UPD/results_for_leaderboard", repo_type="dataset", use_auth_token=TOKEN)
20
+
21
+ current_directory = os.getcwd()
22
+
23
+
24
+ def validate_model_size(s):
25
+ pattern = r'^\d+B$|^-$'
26
+ if re.match(pattern, s):
27
+ return s
28
+ else:
29
+ return '-'
30
+
31
+
32
+ def upload_file(files):
33
+ file_paths = [file.name for file in files]
34
+ return file_paths
35
+
36
+
37
+ # Accuracy Report
38
+ def report_acc(df, groupd='category', metric_type="dual"):
39
+ assert 'split' in df
40
+ assert groupd in [None, 'category', 'l2-category']
41
+
42
+ res = defaultdict(list)
43
+ res['split'] = ['test']
44
+ if groupd is None:
45
+ if metric_type == "dual":
46
+ res['overall'] = [
47
+ np.mean(df['hit']),
48
+ ]
49
+ elif metric_type == "standard":
50
+ res['overall'] = [
51
+ np.mean(df['hit_standard']),
52
+ ]
53
+ elif metric_type == "upd":
54
+ res['overall'] = [
55
+ np.mean(df['hit_upd']),
56
+ ]
57
+ return pd.DataFrame(res)
58
+
59
+ elif groupd in df:
60
+ abilities = list(set(df[groupd]))
61
+ abilities.sort()
62
+ for ab in abilities:
63
+ sub_df = df[df[groupd] == ab]
64
+ if metric_type == "dual":
65
+ res[ab] = [
66
+ np.mean(sub_df['hit']),
67
+ ]
68
+ elif metric_type == "standard":
69
+ res[ab] = [
70
+ np.mean(sub_df['hit_standard']),
71
+ ]
72
+ elif metric_type == "upd":
73
+ res[ab] = [
74
+ np.mean(sub_df['hit_upd']),
75
+ ]
76
+
77
+ return pd.DataFrame(res)
78
+
79
+
80
+ def eval_result_dual(data_main, metric_type="dual"):
81
+ overall = report_acc(data_main, None, metric_type)
82
+ leaf = report_acc(data_main, 'category', metric_type)
83
+
84
+ overall = round(overall['overall'].values[0] * 100, 1)
85
+ leaf = leaf.iloc[:, 1:].values.flatten().tolist()
86
+ leaf = [round(x * 100, 1) for x in leaf]
87
+
88
+ return overall, leaf
89
+
90
+
91
+ def calculate_score(dual_df_path):
92
+ dual_df = pd.read_excel(dual_df_path)
93
+ overall_dual, leaf_dual = eval_result_dual(dual_df)
94
+ overall_standard, leaf_standard = eval_result_dual(dual_df, metric_type="standard")
95
+ overall_upd, leaf_upd = eval_result_dual(dual_df, metric_type="upd")
96
+
97
+ return overall_dual, overall_standard, overall_upd, leaf_dual
98
+
99
+
100
+ # add the new data into the queue
101
+ def add_queue(base_df, dual_df_path, model_name):
102
+ dual_df = pd.read_excel(dual_df_path)
103
+ base_df[f"{model_name}_prediction_standard"] = dual_df["prediction_standard"]
104
+ base_df[f"{model_name}_hit_standard"] = dual_df["hit_standard"]
105
+ base_df[f"{model_name}_prediction_upd"] = dual_df["prediction_upd"]
106
+ base_df[f"{model_name}_hit_upd"] = dual_df["hit_upd"]
107
+ base_df[f"{model_name}_hit"] = dual_df["hit"]
108
+ return base_df
109
+
110
+
111
+ # check whether the input file is correct or not
112
+ def validity_check(input, UPD_type, question_type):
113
+
114
+ input_df = pd.read_excel(input)
115
+
116
+ # check for the correct data size
117
+ data_num_dict = {"AAD": 820, "IASD": 919, "IVQD": 356}
118
+ assert len(input_df) == data_num_dict[UPD_type], "Different Data Size"
119
+ print("len(input)", len(input_df))
120
+ print("data_num_dict[UPD_type]", data_num_dict[UPD_type])
121
+ # check for missing columns
122
+ column_list = ["hit_upd", "hit_standard", "hit", "prediction_upd", "prediction_standard"]
123
+ assert all(x in input_df.columns for x in column_list), "Column Missing"
124
+
125
+ # check for missing values
126
+ assert not input_df[column_list].isnull().any().any(), "Missing values found in columns"
127
+
128
+ # check for the presence of the correct values
129
+ option_mapping = {"AAD": "None of the above", "IASD": "None of the above", "IVQD": "The image and question are irrelevant."}
130
+ instruction_mapping = {"AAD": "F. None of the above", "IASD": "F. None of the above", "IVQD": "F. The image and question are irrelevant."}
131
+
132
+ input_df["D_upd"] = input_df["D_upd"].fillna("")
133
+
134
+ if question_type == "Base":
135
+ assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Base"
136
+ assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Base"
137
+ elif question_type == "Option":
138
+ assert input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]}not found in Option"
139
+ assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Option"
140
+ elif question_type == "Instruction":
141
+ assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Instruction"
142
+
143
+ return True
144
+
145
+
146
+ def add_new_eval(
147
+ input_file,
148
+ model_type: str,
149
+ model_name_textbox: str,
150
+ revision_name_textbox: str,
151
+ model_link: str,
152
+ model_size: str,
153
+ upd_type: str,
154
+ LLM_type: str,
155
+ LLM_name_textbox: str,
156
+ question_type: str
157
+
158
+ ):
159
+
160
+ if input_file is None:
161
+ warning_text = "Error! Empty file!"
162
+ print(warning_text)
163
+ return warning_text
164
+ else:
165
+ model_size = validate_model_size(model_size)
166
+ if upd_type == 'AAD':
167
+ csv_path = CSV_AAD_RESULT_PATH
168
+ elif upd_type == 'IASD':
169
+ csv_path = CSV_IASD_RESULT_PATH
170
+ elif upd_type == 'IVQD':
171
+ csv_path = CSV_IVQD_RESULT_PATH
172
+
173
+ validity_check(input_file, upd_type, question_type)
174
+
175
+ csv_data = pd.read_csv(csv_path)
176
+
177
+ overall_dual_acc, overall_standard_acc, overall_upd_acc, leaf_dual = calculate_score(input_file)
178
+
179
+ if LLM_type == 'Other':
180
+ LLM_name = LLM_name_textbox
181
+ else:
182
+ LLM_name = LLM_type
183
+
184
+ if revision_name_textbox == '':
185
+ col = csv_data.shape[0]
186
+ model_name = model_name_textbox
187
+ else:
188
+ model_name = revision_name_textbox
189
+ model_name_list = csv_data['Model']
190
+ name_list = [name.split(']')[0][1:] for name in model_name_list]
191
+ if revision_name_textbox not in name_list:
192
+ col = csv_data.shape[0]
193
+ else:
194
+ col = name_list.index(revision_name_textbox)
195
+
196
+ if model_link == '':
197
+ model_name = model_name # no url
198
+ else:
199
+ model_name = '[' + model_name + '](' + model_link + ')'
200
+
201
+ # add new data
202
+ new_data = [
203
+ model_type,
204
+ model_name,
205
+ LLM_name,
206
+ model_size,
207
+ question_type,
208
+ overall_dual_acc,
209
+ overall_standard_acc,
210
+ overall_upd_acc,
211
+ ]
212
+ new_data += leaf_dual
213
+
214
+ # If the same data already exists, return an error.
215
+ if new_data in csv_data.values.tolist():
216
+ warning_text = "Error! The same data already exists!"
217
+ print(warning_text)
218
+ return warning_text
219
+ # If the same model name already exists, return an error.
220
+ elif new_data[:5] in csv_data.values.tolist():
221
+ warning_text = "Error! The same data already exists! Please fill revision_name."
222
+ print(warning_text)
223
+ return warning_text
224
+
225
+ csv_data.loc[col] = new_data
226
+ csv_data.to_csv(csv_path, index=False)
227
+
228
+ absolute_result_path = os.path.abspath(csv_path)
229
+ if not os.path.exists(absolute_result_path):
230
+ raise FileNotFoundError(f"File {absolute_result_path} not found")
231
+
232
+ repo.git_pull()
233
+ repo.git_add(absolute_result_path)
234
+
235
+ csv_queue_path = os.path.join(CSV_QUEUE_DIR, f"detail_results_{upd_type.lower()}_{question_type.lower()}.csv")
236
+ base_data = pd.read_csv(csv_queue_path)
237
+
238
+ base_data = add_queue(base_data, input_file, model_name)
239
+ base_data.to_csv(csv_queue_path, index=False)
240
+
241
+ absolute_queue_path = os.path.abspath(csv_queue_path)
242
+ if not os.path.exists(absolute_queue_path):
243
+ raise FileNotFoundError(f"File {absolute_queue_path} not found")
244
+
245
+ repo.git_add(absolute_queue_path)
246
+ repo.git_commit(f"add {model_name} results in {question_type}")
247
+
248
+ repo.git_push()
249
+
250
+ return 0
251
+
252
+
253
+ def get_baseline_aad_df():
254
+ repo.git_pull()
255
+ df = pd.read_csv(CSV_AAD_RESULT_PATH)
256
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
257
+ present_columns = MODEL_INFO + checkbox_aad_group.value
258
+ df = df[present_columns]
259
+ return df
260
+
261
+
262
+ def get_all_aad_df():
263
+ repo.git_pull()
264
+ df = pd.read_csv(CSV_AAD_RESULT_PATH)
265
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
266
+ return df
267
+
268
+
269
+ def get_baseline_iasd_df():
270
+ repo.git_pull()
271
+ df = pd.read_csv(CSV_IASD_RESULT_PATH)
272
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
273
+ present_columns = MODEL_INFO + checkbox_iasd_group.value
274
+ df = df[present_columns]
275
+ return df
276
+
277
+
278
+ def get_all_iasd_df():
279
+ repo.git_pull()
280
+ df = pd.read_csv(CSV_IASD_RESULT_PATH)
281
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
282
+ return df
283
+
284
+
285
+ def get_baseline_ivqd_df():
286
+ repo.git_pull()
287
+ df = pd.read_csv(CSV_IVQD_RESULT_PATH)
288
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
289
+ present_columns = MODEL_INFO + checkbox_ivqd_group.value
290
+ df = df[present_columns]
291
+ return df
292
+
293
+
294
+ def get_all_ivqd_df():
295
+ repo.git_pull()
296
+ df = pd.read_csv(CSV_IVQD_RESULT_PATH)
297
+ df = df.sort_values(by="Overall Dual Acc.", ascending=False)
298
+ return df
299
+
300
+
301
+ block = gr.Blocks()
302
+
303
+
304
+ with block:
305
+ gr.Markdown(
306
+ LEADERBORAD_INTRODUCTION
307
+ )
308
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
309
+ # table mmupd bench
310
+ with gr.TabItem("🏅 MM-AAD Benchmark", elem_id="mmaad-benchmark-tab-table", id=1):
311
+ with gr.Row():
312
+ with gr.Accordion("Citation", open=False):
313
+ citation_button = gr.Textbox(
314
+ value=CITATION_BUTTON_TEXT,
315
+ label=CITATION_BUTTON_LABEL,
316
+ elem_id="citation-button",
317
+ show_copy_button=True,
318
+ )
319
+
320
+ # selection for column part:
321
+ checkbox_aad_group = gr.CheckboxGroup(
322
+ choices=TASK_AAD_INFO,
323
+ value=AVG_INFO,
324
+ label="Evaluation Dimension",
325
+ interactive=True,
326
+ ) # user can select the evaluation dimension
327
+
328
+ with gr.Row():
329
+ # selection for model size part:
330
+ model_size = gr.CheckboxGroup(
331
+ choices=MODEL_SIZE,
332
+ value=MODEL_SIZE,
333
+ label="Model Size",
334
+ interactive=True,
335
+ )
336
+
337
+ # selection for model size part:
338
+ question_type = gr.CheckboxGroup(
339
+ choices=QUESTION_TYPE,
340
+ value=QUESTION_TYPE,
341
+ label="Question Type",
342
+ interactive=True,
343
+ )
344
+
345
+ baseline_value = get_baseline_aad_df()
346
+ baseline_header = MODEL_INFO + checkbox_aad_group.value
347
+ baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_aad_group.value)
348
+
349
+ data_component_aad = gr.components.Dataframe(
350
+ value=baseline_value,
351
+ headers=baseline_header,
352
+ type="pandas",
353
+ datatype=baseline_datatype,
354
+ interactive=False,
355
+ visible=True,
356
+ )
357
+
358
+ def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
359
+
360
+ updated_data = get_all_aad_df()
361
+ # model_size & question_type:
362
+
363
+ def custom_filter(row, model_size_filters, question_type_filters):
364
+ model_size = row['Model Size']
365
+ question_type = row['Question Type']
366
+ model_size = model_size.upper()
367
+
368
+ if model_size == '-':
369
+ size_filter = '-' in model_size_filters
370
+ elif 'B' in model_size:
371
+ size = float(model_size.replace('B', ''))
372
+ size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
373
+ else:
374
+ size_filter = False
375
+
376
+ question_type_filter = question_type in question_type_filters
377
+
378
+ return size_filter and question_type_filter
379
+
380
+ mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
381
+ updated_data = updated_data[mask]
382
+
383
+ # columns:
384
+ selected_columns = [item for item in TASK_AAD_INFO if item in selected_columns]
385
+ present_columns = MODEL_INFO + selected_columns
386
+ updated_data = updated_data[present_columns]
387
+ updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
388
+ updated_headers = present_columns
389
+ update_datatype = [DATA_AAD_TITILE_TYPE[COLUMN_AAD_NAMES.index(x)] for x in updated_headers]
390
+
391
+ filter_component = gr.components.Dataframe(
392
+ value=updated_data,
393
+ headers=updated_headers,
394
+ type="pandas",
395
+ datatype=update_datatype,
396
+ interactive=False,
397
+ visible=True,
398
+ )
399
+ return filter_component
400
+
401
+
402
+ model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
403
+ question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
404
+ checkbox_aad_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
405
+
406
+
407
+ with gr.TabItem("🏅 MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
408
+ with gr.Row():
409
+ with gr.Accordion("Citation", open=False):
410
+ citation_button = gr.Textbox(
411
+ value=CITATION_BUTTON_TEXT,
412
+ label=CITATION_BUTTON_LABEL,
413
+ elem_id="citation-button",
414
+ show_copy_button=True,
415
+ )
416
+
417
+ checkbox_iasd_group = gr.CheckboxGroup(
418
+ choices=TASK_IASD_INFO,
419
+ value=AVG_INFO,
420
+ label="Evaluation Dimension",
421
+ interactive=True,
422
+ ) # user can select the evaluation dimension
423
+
424
+ with gr.Row():
425
+ # selection for model size part:
426
+ model_size = gr.CheckboxGroup(
427
+ choices=MODEL_SIZE,
428
+ value=MODEL_SIZE,
429
+ label="Model Size",
430
+ interactive=True,
431
+ )
432
+
433
+ # selection for model size part:
434
+ question_type = gr.CheckboxGroup(
435
+ choices=QUESTION_TYPE,
436
+ value=QUESTION_TYPE,
437
+ label="Question Type",
438
+ interactive=True,
439
+ )
440
+
441
+ baseline_value = get_baseline_iasd_df()
442
+ baseline_header = MODEL_INFO + checkbox_iasd_group.value
443
+ baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_iasd_group.value)
444
+
445
+ data_component_iasd = gr.components.Dataframe(
446
+ value=baseline_value,
447
+ headers=baseline_header,
448
+ type="pandas",
449
+ datatype=baseline_datatype,
450
+ interactive=False,
451
+ visible=True,
452
+ )
453
+
454
+ def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
455
+
456
+ updated_data = get_all_iasd_df()
457
+
458
+ def custom_filter(row, model_size_filters, question_type_filters):
459
+ model_size = row['Model Size']
460
+ question_type = row['Question Type']
461
+ model_size = model_size.upper()
462
+
463
+ if model_size == '-':
464
+ size_filter = '-' in model_size_filters
465
+ elif 'B' in model_size:
466
+ size = float(model_size.replace('B', ''))
467
+ size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
468
+ else:
469
+ size_filter = False
470
+
471
+ question_type_filter = question_type in question_type_filters
472
+
473
+ return size_filter and question_type_filter
474
+
475
+ mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
476
+ updated_data = updated_data[mask]
477
+
478
+ # columns:
479
+ selected_columns = [item for item in TASK_IASD_INFO if item in selected_columns]
480
+ present_columns = MODEL_INFO + selected_columns
481
+ updated_data = updated_data[present_columns]
482
+ updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
483
+ updated_headers = present_columns
484
+ update_datatype = [DATA_IASD_TITILE_TYPE[COLUMN_IASD_NAMES.index(x)] for x in updated_headers]
485
+
486
+ filter_component = gr.components.Dataframe(
487
+ value=updated_data,
488
+ headers=updated_headers,
489
+ type="pandas",
490
+ datatype=update_datatype,
491
+ interactive=False,
492
+ visible=True,
493
+ )
494
+ return filter_component
495
+
496
+ model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
497
+ question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
498
+ checkbox_iasd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
499
+
500
+ # Table 3
501
+ with gr.TabItem("🏅 MM-IVQD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=3):
502
+ with gr.Row():
503
+ with gr.Accordion("Citation", open=False):
504
+ citation_button = gr.Textbox(
505
+ value=CITATION_BUTTON_TEXT,
506
+ label=CITATION_BUTTON_LABEL,
507
+ elem_id="citation-button",
508
+ show_copy_button=True,
509
+ )
510
+
511
+ # selection for column part:
512
+ checkbox_ivqd_group = gr.CheckboxGroup(
513
+ choices=TASK_IVQD_INFO,
514
+ value=AVG_INFO,
515
+ label="Evaluation Dimension",
516
+ interactive=True,
517
+ ) # user can select the evaluation dimension
518
+
519
+ with gr.Row():
520
+ # selection for model size part:
521
+ model_size = gr.CheckboxGroup(
522
+ choices=MODEL_SIZE,
523
+ value=MODEL_SIZE,
524
+ label="Model Size",
525
+ interactive=True,
526
+ )
527
+
528
+ # selection for model size part:
529
+ question_type = gr.CheckboxGroup(
530
+ choices=QUESTION_TYPE,
531
+ value=QUESTION_TYPE,
532
+ label="Question Type",
533
+ interactive=True,
534
+ )
535
+
536
+ baseline_value = get_baseline_ivqd_df()
537
+ baseline_header = MODEL_INFO + checkbox_ivqd_group.value
538
+ baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_ivqd_group.value)
539
+
540
+ data_component_ivqd = gr.components.Dataframe(
541
+ value=baseline_value,
542
+ headers=baseline_header,
543
+ type="pandas",
544
+ datatype=baseline_datatype,
545
+ interactive=False,
546
+ visible=True,
547
+ )
548
+
549
+ def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
550
+
551
+ updated_data = get_all_ivqd_df()
552
+
553
+ def custom_filter(row, model_size_filters, question_type_filters):
554
+ model_size = row['Model Size']
555
+ question_type = row['Question Type']
556
+ model_size = model_size.upper()
557
+
558
+ if model_size == '-':
559
+ size_filter = '-' in model_size_filters
560
+ elif 'B' in model_size:
561
+ size = float(model_size.replace('B', ''))
562
+ size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
563
+ else:
564
+ size_filter = False
565
+
566
+ question_type_filter = question_type in question_type_filters
567
+
568
+ return size_filter and question_type_filter
569
+
570
+ mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
571
+ updated_data = updated_data[mask]
572
+
573
+ selected_columns = [item for item in TASK_IVQD_INFO if item in selected_columns]
574
+ present_columns = MODEL_INFO + selected_columns
575
+ updated_data = updated_data[present_columns]
576
+ updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
577
+ updated_headers = present_columns
578
+ update_datatype = [DATA_IVQD_TITILE_TYPE[COLUMN_IVQD_NAMES.index(x)] for x in updated_headers]
579
+
580
+ filter_component = gr.components.Dataframe(
581
+ value=updated_data,
582
+ headers=updated_headers,
583
+ type="pandas",
584
+ datatype=update_datatype,
585
+ interactive=False,
586
+ visible=True,
587
+ )
588
+ return filter_component
589
+
590
+ model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
591
+ question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
592
+ checkbox_ivqd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
593
+
594
+
595
+ # table 4
596
+ with gr.TabItem("📝 About", elem_id="mmupd-benchmark-tab-table", id=4):
597
+ gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
598
+
599
+ # table 5
600
+ with gr.TabItem("🚀 Submit here! ", elem_id="mmupd-benchmark-tab-table", id=5):
601
+ gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
602
+
603
+ with gr.Row():
604
+ gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
605
+
606
+ with gr.Row():
607
+ gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
608
+
609
+ with gr.Row():
610
+ with gr.Column():
611
+ model_type = gr.Dropdown(
612
+ choices=["VLM", "LLM"],
613
+ label="Model type",
614
+ multiselect=False,
615
+ value="VLM",
616
+ interactive=True,
617
+ )
618
+ model_name_textbox = gr.Textbox(
619
+ label="Model name", placeholder="LLaMA-7B"
620
+ )
621
+ revision_name_textbox = gr.Textbox(
622
+ label="Revision Model Name", placeholder="LLaMA-7B"
623
+ )
624
+
625
+ model_link = gr.Textbox(
626
+ label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
627
+ )
628
+
629
+ model_size = gr.Textbox(
630
+ label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
631
+ )
632
+
633
+ with gr.Column():
634
+ LLM_type = gr.Dropdown(
635
+ choices=["Vicuna-1.5-7B", "Vicuna-1.5-13B", "Flan-T5-XL", "LLaMA-7B", "Llama-13B", "Llama-3-8B", "Llama-3-70B", "Yi-34B", "Mistral-7B", "Other"],
636
+ label="LLM type",
637
+ multiselect=False,
638
+ value="Vicuna-1.5-13B",
639
+ interactive=True,
640
+ )
641
+
642
+ LLM_name_textbox = gr.Textbox(
643
+ label="LLM model (Required for Other)",
644
+ placeholder="GPT-4",
645
+ )
646
+
647
+ upd_type = gr.Dropdown(
648
+ choices=[
649
+ "AAD",
650
+ "IASD",
651
+ "IVQD",
652
+ ],
653
+ label="UPD type",
654
+ multiselect=False,
655
+ value="AAD",
656
+ interactive=True,
657
+ )
658
+
659
+ question_type = gr.Dropdown(
660
+ choices=QUESTION_TYPE,
661
+ label="Question Type",
662
+ multiselect=False,
663
+ value=QUESTION_TYPE[0],
664
+ interactive=True,
665
+ )
666
+
667
+ with gr.Column():
668
+
669
+ input_file = gr.components.File(label="Click to Upload a Dual Evaluation File", file_count="single", type='binary')
670
+ submit_button = gr.Button("Submit Eval")
671
+
672
+ submission_result = gr.Markdown()
673
+ submit_button.click(
674
+ add_new_eval,
675
+ inputs = [
676
+ input_file,
677
+ model_type,
678
+ model_name_textbox,
679
+ revision_name_textbox,
680
+ model_link,
681
+ model_size,
682
+ upd_type,
683
+ LLM_type,
684
+ LLM_name_textbox,
685
+ question_type
686
+ ],
687
+ )
688
+
689
+ def refresh_data():
690
+ value1 = get_baseline_aad_df()
691
+ value2 = get_baseline_iasd_df()
692
+ value3 = get_baseline_ivqd_df()
693
+
694
+ return value1, value2, value3
695
+
696
+ with gr.Row():
697
+ data_run = gr.Button("Refresh")
698
+ data_run.click(
699
+ refresh_data, outputs=[data_component_aad, data_component_iasd, data_component_ivqd]
700
+ )
701
+
702
+ block.launch()
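To make the accuracy columns concrete, below is a toy sketch of the aggregation that `report_acc()` / `eval_result_dual()` above perform on an uploaded `result_dual.xlsx`. The tiny DataFrame and its values are invented for illustration only.

```python
import pandas as pd

# Toy stand-in for result_dual.xlsx (all values are invented).
df = pd.DataFrame({
    "split": ["test"] * 4,
    "category": ["ocr", "ocr", "image_scene", "image_scene"],
    "hit_standard": [1, 1, 0, 1],  # standard question answered correctly
    "hit_upd":      [1, 0, 0, 1],  # UPD (unsolvable) question handled correctly
    "hit":          [1, 0, 0, 1],  # dual result: counted correct only when both above are correct
})

# Overall Dual Acc., as in report_acc(df, None, "dual")
print(round(df["hit"].mean() * 100, 1))                       # 50.0

# Per-category Dual Acc., as in report_acc(df, "category", "dual")
print((df.groupby("category")["hit"].mean() * 100).round(1))  # image_scene 50.0, ocr 50.0
```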
constants.py ADDED
@@ -0,0 +1,90 @@
1
+ # this .py file stores constants
2
+ MODEL_INFO = ["Model Type", "Model", "Language Model", "Question Type"]
3
+ MODEL_SIZE = ["<10B", ">=10B", "-"]
4
+ QUESTION_TYPE = ["Base", "Option", "Instruction"]
5
+ LEADERBOARD_VERSION = ["Version1"]
6
+ TASK_AAD_INFO = ["Overall Dual Acc.", "Overall Standard Acc.", "Overall UPD Acc.", "action_recognition", "attribute_comparison", "attribute_recognition", "celebrity_recognition", "function_reasoning", "future_prediction", "identity_reasoning", "image_emotion", "image_scene", "image_style", "image_topic", "nature_relation", "object_localization", "ocr", "physical_property_reasoning", "physical_relation", "social_relation", "structuralized_imagetext_understanding"]
7
+ TASK_IASD_INFO = ["Overall Dual Acc.", "Overall Standard Acc.", "Overall UPD Acc.", "action_recognition", "attribute_comparison", "attribute_recognition", "celebrity_recognition", "function_reasoning", "future_prediction", "identity_reasoning", "image_emotion", "image_scene", "image_style", "image_topic", "nature_relation", "object_localization", "ocr", "physical_property_reasoning", "physical_relation", "social_relation", "structuralized_imagetext_understanding"]
8
+ TASK_IVQD_INFO = ["Overall Dual Acc.", "Overall Standard Acc.", "Overall UPD Acc.", "action_recognition", "attribute_comparison", "attribute_recognition", "celebrity_recognition", "function_reasoning", "image_scene", "nature_relation", "object_localization", "ocr", "physical_property_reasoning", "physical_relation", "social_relation"]
9
+
10
+ AVG_INFO = ["Overall Dual Acc."]
11
+
12
+ DATA_AAD_TITILE_TYPE = ["markdown", "markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
13
+ DATA_IASD_TITILE_TYPE = ["markdown", "markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
14
+ DATA_IVQD_TITILE_TYPE = ["markdown", "markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
15
+
16
+ CSV_AAD_RESULT_PATH = "./download_from_dataset/results/result_aad.csv"
17
+ CSV_IASD_RESULT_PATH = "./download_from_dataset/results/result_iasd.csv"
18
+ CSV_IVQD_RESULT_PATH = "./download_from_dataset/results/result_ivqd.csv"
19
+
20
+ CSV_QUEUE_DIR = "./download_from_dataset/queue/"
21
+
22
+ COLUMN_AAD_NAMES = MODEL_INFO + TASK_AAD_INFO
23
+ COLUMN_IASD_NAMES = MODEL_INFO + TASK_IASD_INFO
24
+ COLUMN_IVQD_NAMES = MODEL_INFO + TASK_IVQD_INFO
25
+
26
+ LEADERBORAD_VERSION = ["MM-AAD", "MM-IASD", "MM-IVQD"]
27
+
28
+
29
+ LEADERBORAD_INTRODUCTION = """
30
+ # UPD Leaderboard
31
+
32
+ *"Which VLM is reliable?"*
33
+ 🏆 Welcome to the leaderboard of the **UPD**! *Unsolvable Problem Detection: Evaluating Trustworthiness of Vision Language Models* (**arXiv 2024**) [![Code](https://img.shields.io/github/stars/AtsuMiyai/UPD.svg?style=social&label=Official)](https://github.com/AtsuMiyai/UPD)
34
+ <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
35
+ <a href='https://arxiv.org/abs/2403.20331'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
36
+ </div>
37
+
38
+ - **Multiple Scenario Evaluation:** We carefully design prompt choices and examine three scenarios: (i) base (no instruction), (ii) option (add an additional option), (iii) instruction (add an instruction).
39
+ - **Ability-wise Evaluation:** We carefully decompose each benchmark into more than 10 abilities to reveal each individual model's strengths and weaknesses.
40
+ - **Valuable Insights:** MM-UPD Bench provides multi-perspective insights on trustworthiness and reliability for the community.
41
+
42
+ Please follow the instructions in [UPD](https://github.com/AtsuMiyai/UPD) to upload the generated `result_dual.xlsx` file here. After clicking the `Submit Eval` button, click the `Refresh` button.
43
+ """
44
+
45
+
46
+ SUBMIT_INTRODUCTION = """# Submit on MM-UPD Benchmark Introduction
47
+ 1. Obtain Dual Result Excel File from our [github repository](https://github.com/AtsuMiyai/UPD/tree/main/scripts/inference).
48
+ 2. If you want to update model performance by uploading new results, please ensure 'Revision Model Name' is the same as what's shown in the leaderboard. For example, if you want to modify LLaVA-1.5-13B's performance, you need to fill in 'LLaVA-1.5-13B' in 'Revision Model Name'.
49
+ 3. Please provide the correct link of your model's repository for each submission.
50
+ 4. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
51
+
52
+ Note: The example of the submitted excel file is this url: [llava1.5_13b_result_dual.xlsx](https://docs.google.com/spreadsheets/d/1Se0_iYHr6aktHFnCzwArU1ExTjL-UmeO/edit?usp=sharing&ouid=103623120947968158097&rtpof=true&sd=true).
53
+ Please make sure that (i) the excel file has predictions for all data, and (ii) the hit_upd, hit_standard, and hit columns exist.
54
+
55
+ ## Submit Example
56
+ If you want to upload LLaVA-1.5-13B's result in the leaderboard, you need to:
57
+ 1. Select VLM in 'Model Type'.
58
+ 2. Fill in 'LLaVA-1.5-13B' in 'Model Name' if this is your first time submitting your result (you can leave 'Revision Model Name' blank).
59
+ 3. Fill in 'LLaVA-1.5-13B' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
60
+ 4. Fill in 'https://github.com/haotian-liu/LLaVA' in 'Model Link'.
61
+ 5. Fill in '13B' in 'Model size'.
62
+ 6. Select 'Vicuna-1.5-13B' in 'LLM Type'.
63
+ 7. Fill in 'LLM model' if you select Others for 'LLM Type'.
64
+ 8. Select 'AAD', 'IASD', or 'IVQD' in 'UPD type'.
65
+ 9. Select 'Base', 'Option', or 'Instruction' in 'Question Type'.
66
+ 10. Upload your `result_dual.xlsx` file.
67
+ 11. Click the 'Submit Eval' button.
68
+ 12. Click 'Refresh' to obtain the uploaded leaderboard.
69
+
70
+ ### If you have any questions or deletion requests, please contact [miyai@cvm.t.u-tokyo.ac.jp](mailto:miyai@cvm.t.u-tokyo.ac.jp).
71
+ ### ⚠️ Please do not submit any malicious content.
72
+ """
73
+
74
+
75
+
76
+ LEADERBORAD_INFO = """
77
+ MM-UPD Bench is a comprehensive benchmark for evaluating the trustworthiness of Vision Language Models
78
+ (VLMs) in the context of Unsolvable Problem Detection (UPD). MM-UPD encompasses three benchmarks:
79
+ MM-AAD, MM-IASD, and MM-IVQD. Each benchmark covers a wide range of abilities. Through these benchmarks,
80
+ we aim to provide a comprehensive evaluation of VLMs across multiple scenarios.
81
+ """
82
+
83
+
84
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
85
+ CITATION_BUTTON_TEXT = r"""@article{miyai2024unsolvable,
86
+ title={{Unsolvable Problem Detection}: Evaluating Trustworthiness of Vision Language Models},
87
+ author={Miyai, Atsuyuki and Yang, Jingkang and Zhang, Jingyang and Ming, Yifei and Yu, Qing and Irie, Go and Li, Yixuan and Li, Hai and Liu, Ziwei and Aizawa, Kiyoharu},
88
+ journal={arXiv preprint arXiv:2403.20331},
89
+ year={2024}
90
+ }"""
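The submission checks described in SUBMIT_INTRODUCTION above are enforced by `validity_check()` in app.py; a minimal local sanity check along the same lines, assuming your file is named `result_dual.xlsx` and you are submitting to the AAD track (both are placeholders), could look like this:

```python
import pandas as pd

# Minimal local sanity check mirroring validity_check() and eval_result_dual() in app.py.
UPD_TYPE = "AAD"  # or "IASD" / "IVQD"
EXPECTED_ROWS = {"AAD": 820, "IASD": 919, "IVQD": 356}
REQUIRED_COLUMNS = ["hit_upd", "hit_standard", "hit", "prediction_upd", "prediction_standard"]

df = pd.read_excel("result_dual.xlsx")  # needs openpyxl (listed in requirements.txt)

assert len(df) == EXPECTED_ROWS[UPD_TYPE], "Different data size"
assert all(col in df.columns for col in REQUIRED_COLUMNS), "Column missing"
assert not df[REQUIRED_COLUMNS].isnull().any().any(), "Missing values found in columns"

# Overall accuracies as reported on the leaderboard
for col, label in [("hit", "Dual"), ("hit_standard", "Standard"), ("hit_upd", "UPD")]:
    print(f"Overall {label} Acc.: {round(df[col].mean() * 100, 1)}")
```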
pyproject.toml ADDED
@@ -0,0 +1,13 @@
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ APScheduler==3.10.1
2
+ black==23.11.0
3
+ click==8.1.3
4
+ datasets==2.14.5
5
+ gradio==4.4.0
6
+ gradio_client==0.7.0
7
+ huggingface-hub>=0.23.2
8
+ matplotlib==3.7.1
9
+ numpy==1.24.2
10
+ pandas==2.0.0
11
+ python-dateutil==2.8.2
12
+ requests==2.28.2
13
+ tqdm==4.65.0
14
+ transformers==4.35.2
15
+ tokenizers>=0.15.0
16
+ git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
+ accelerate==0.24.1
18
+ sentencepiece
19
+ openpyxl
src/utils_display.py ADDED
@@ -0,0 +1,99 @@
1
+ from dataclasses import dataclass
2
+
3
+ # These classes are for user facing column names, to avoid having to change them
4
+ # all around the code when a modification is needed
5
+ @dataclass
6
+ class ColumnContent:
7
+ name: str
8
+ type: str
9
+ displayed_by_default: bool
10
+ hidden: bool = False
11
+
12
+ def fields(raw_class):
13
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
14
+
15
+ @dataclass(frozen=True)
16
+ class AutoEvalColumn: # Auto evals column
17
+ model_type_symbol = ColumnContent("T", "str", True)
18
+ model = ColumnContent("Model", "markdown", True)
19
+ average = ColumnContent("Average ⬆️", "number", True)
20
+ arc = ColumnContent("ARC", "number", True)
21
+ hellaswag = ColumnContent("HellaSwag", "number", True)
22
+ mmlu = ColumnContent("MMLU", "number", True)
23
+ truthfulqa = ColumnContent("TruthfulQA", "number", True)
24
+ model_type = ColumnContent("Type", "str", False)
25
+ precision = ColumnContent("Precision", "str", False, True)
26
+ license = ColumnContent("Hub License", "str", False)
27
+ params = ColumnContent("#Params (B)", "number", False)
28
+ likes = ColumnContent("Hub ❤️", "number", False)
29
+ revision = ColumnContent("Model sha", "str", False, False)
30
+ dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
31
+
32
+ @dataclass(frozen=True)
33
+ class EloEvalColumn: # Elo evals column
34
+ model = ColumnContent("Model", "markdown", True)
35
+ gpt4 = ColumnContent("GPT-4 (all)", "number", True)
36
+ human_all = ColumnContent("Human (all)", "number", True)
37
+ human_instruct = ColumnContent("Human (instruct)", "number", True)
38
+ human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
39
+
40
+
41
+ @dataclass(frozen=True)
42
+ class EvalQueueColumn: # Queue column
43
+ model = ColumnContent("model", "markdown", True)
44
+ revision = ColumnContent("revision", "str", True)
45
+ private = ColumnContent("private", "bool", True)
46
+ precision = ColumnContent("precision", "bool", True)
47
+ weight_type = ColumnContent("weight_type", "str", "Original")
48
+ status = ColumnContent("status", "str", True)
49
+
50
+ LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
51
+
52
+
53
+ KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
54
+ VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
55
+ OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
56
+ DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
57
+ MODEL_PAGE = "https://huggingface.co/models"
58
+ LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
59
+ VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
60
+ ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
61
+
62
+
63
+ def model_hyperlink(link, model_name):
64
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
65
+
66
+
67
+ def make_clickable_model(model_name):
68
+ link = f"https://huggingface.co/{model_name}"
69
+
70
+ if model_name in LLAMAS:
71
+ link = LLAMA_LINK
72
+ model_name = model_name.split("/")[1]
73
+ elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
74
+ link = VICUNA_LINK
75
+ model_name = "stable-vicuna-13b"
76
+ elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
77
+ link = ALPACA_LINK
78
+ model_name = "alpaca-13b"
79
+ if model_name == "dolly-12b":
80
+ link = DOLLY_LINK
81
+ elif model_name == "vicuna-13b":
82
+ link = VICUNA_LINK
83
+ elif model_name == "koala-13b":
84
+ link = KOALA_LINK
85
+ elif model_name == "oasst-12b":
86
+ link = OASST_LINK
87
+ #else:
88
+ # link = MODEL_PAGE
89
+
90
+ return model_hyperlink(link, model_name)
91
+
92
+ def styled_error(error):
93
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
94
+
95
+ def styled_warning(warn):
96
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
97
+
98
+ def styled_message(message):
99
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"