Commit 94bd921
Parent(s): 730f0f9

feat: adapt to MMIE

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
- .gitattributes +0 -1
- README.md +2 -10
- app.py +112 -145
- evals/.gitattributes +0 -55
- evals/README.md +0 -6
- evals/mjbench-results/detailed-results/AestheticsPredictor.json +0 -47
- evals/mjbench-results/detailed-results/BLIP-v2.json +0 -47
- evals/mjbench-results/detailed-results/CLIP-v2.json +0 -47
- evals/mjbench-results/detailed-results/Claude 3 Opus.json +0 -47
- evals/mjbench-results/detailed-results/GPT-4-vision.json +0 -47
- evals/mjbench-results/detailed-results/GPT-4o.json +0 -47
- evals/mjbench-results/detailed-results/Gemini Ultra.json +0 -47
- evals/mjbench-results/detailed-results/HPS-v2.1.json +0 -47
- evals/mjbench-results/detailed-results/Idefics2-8b.json +0 -47
- evals/mjbench-results/detailed-results/ImageReward.json +0 -47
- evals/mjbench-results/detailed-results/Instructblip-7b.json +0 -47
- evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json +0 -47
- evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json +0 -47
- evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json +0 -47
- evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json +0 -47
- evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json +0 -35
- evals/mjbench-results/detailed-results/MiniGPT4-v2.json +0 -47
- evals/mjbench-results/detailed-results/PickScore-v1.json +0 -47
- evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json +0 -47
- evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json +0 -47
- evals/mjbench-results/detailed-results/Qwen-VL-Chat.json +0 -47
- evals/mjbench-results/overall-results/AestheticsPredictor.json +0 -12
- evals/mjbench-results/overall-results/BLIP-v2.json +0 -12
- evals/mjbench-results/overall-results/CLIP-v2.json +0 -12
- evals/mjbench-results/overall-results/Claude 3 Opus.json +0 -12
- evals/mjbench-results/overall-results/GPT-4-vision.json +0 -12
- evals/mjbench-results/overall-results/GPT-4o.json +0 -12
- evals/mjbench-results/overall-results/Gemini Ultra.json +0 -12
- evals/mjbench-results/overall-results/HPS-v2.1.json +0 -12
- evals/mjbench-results/overall-results/Idefics2-8b.json +0 -12
- evals/mjbench-results/overall-results/ImageReward.json +0 -12
- evals/mjbench-results/overall-results/Instructblip-7b.json +0 -12
- evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json +0 -12
- evals/mjbench-results/overall-results/LLaVA-1.5-13b.json +0 -12
- evals/mjbench-results/overall-results/LLaVA-1.5-7b.json +0 -12
- evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json +0 -12
- evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json +0 -12
- evals/mjbench-results/overall-results/MiniGPT4-v2.json +0 -12
- evals/mjbench-results/overall-results/PickScore-v1.json +0 -12
- evals/mjbench-results/overall-results/Prometheus-Vision-13b.json +0 -12
- evals/mjbench-results/overall-results/Prometheus-Vision-7b.json +0 -12
- evals/mjbench-results/overall-results/Qwen-VL-Chat.json +0 -12
- src/about.py +4 -5
- src/envs.py +3 -3
- src/logo.png +0 -0
.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
-mj-bench-logo.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: MMIE Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -45,13 +45,5 @@ You'll find
 
 ## Citation
 ```
-
-    title={MJ-Bench: Is Your Multimodal Reward Model Really a Good Judge for Text-to-Image Generation?},
-    author={Zhaorun Chen and Yichao Du and Zichen Wen and Yiyang Zhou and Chenhang Cui and Zhenzhen Weng and Haoqin Tu and Chaoqi Wang and Zhengwei Tong and Qinglan Huang and Canyu Chen and Qinghao Ye and Zhihong Zhu and Yuqing Zhang and Jiawei Zhou and Zhuokai Zhao and Rafael Rafailov and Chelsea Finn and Huaxiu Yao},
-    year={2024},
-    eprint={2407.04842},
-    archivePrefix={arXiv},
-    primaryClass={cs.CV},
-    url={https://arxiv.org/abs/2407.04842},
-}
+
 ```
app.py CHANGED
@@ -7,7 +7,6 @@ import numpy as np
 from pathlib import Path
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
-from datasets import load_dataset
 
 
 from src.about import (
@@ -20,19 +19,19 @@ from src.about import (
     ABOUT_TEXT
 )
 from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
+# from src.display.utils import (
+#     BENCHMARK_COLS,
+#     COLS,
+#     EVAL_COLS,
+#     EVAL_TYPES,
+#     NUMERIC_INTERVALS,
+#     TYPES,
+#     AutoEvalColumn,
+#     ModelType,
+#     fields,
+#     WeightType,
+#     Precision
+# )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
 try:
@@ -76,7 +75,7 @@ PERSPECTIVE_COUNTS= {
 
 
 
-META_DATA = ['Model'
+META_DATA = ['Model']
 
 
 
@@ -84,36 +83,36 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 
-color_map = {
-    "Score Model": "#7497db",
-    "Opensource VLM": "#E8ECF2",
-    "Closesource VLM": "#ffcd75",
-    "Others": "#75809c",
-    # #7497db #E8ECF2 #ffcd75 #75809c
-}
-def color_model_type_column(df, color_map):
-    """
-    Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
-
-    Parameters:
-    df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
-    color_map (dict): A dictionary mapping model types to colors.
-
-    Returns:
-    pd.Styler: The styled DataFrame.
-    """
-    # Function to apply color based on the model type
-    def apply_color(val):
-        color = color_map.get(val, "default")  # Default color if not specified in color_map
-        return f'background-color: {color}'
+# color_map = {
+#     "Score Model": "#7497db",
+#     "Opensource VLM": "#E8ECF2",
+#     "Closesource VLM": "#ffcd75",
+#     "Others": "#75809c",
+#     # #7497db #E8ECF2 #ffcd75 #75809c
+# }
+# def color_model_type_column(df, color_map):
+#     """
+#     Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
+
+#     Parameters:
+#     df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
+#     color_map (dict): A dictionary mapping model types to colors.
+
+#     Returns:
+#     pd.Styler: The styled DataFrame.
+#     """
+#     # Function to apply color based on the model type
+#     def apply_color(val):
+#         color = color_map.get(val, "default")  # Default color if not specified in color_map
+#         return f'background-color: {color}'
 
-    # Format for different columns
-    format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
-    format_dict['Overall Score'] = "{:.2f}"
-    format_dict[''] = "{:d}"
+#     # Format for different columns
+#     format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA}
+#     format_dict['Overall Score'] = "{:.2f}"
+#     format_dict[''] = "{:d}"
 
-    return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
+#     return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
 
 def regex_table(dataframe, regex, filter_button, style=True):
     """
@@ -127,14 +126,10 @@ def regex_table(dataframe, regex, filter_button, style=True):
     # if filter_button, remove all rows with "ai2" in the model name
     update_scores = False
     if isinstance(filter_button, list) or isinstance(filter_button, str):
-        if "Score Model" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Score Model", case=False, na=False)]
-        if "Opensource VLM" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Opensource VLM", case=False, na=False)]
-        if "Closesource VLM" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Closesource VLM", case=False, na=False)]
-        if "Others" not in filter_button:
-            dataframe = dataframe[~dataframe["Model Type"].str.contains("Others", case=False, na=False)]
+        if "Integrated LVLM" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Integrated LVLM", case=False, na=False)]
+        if "Interleaved LVLM" not in filter_button:
+            dataframe = dataframe[~dataframe["Model Type"].str.contains("Interleaved LVLM", case=False, na=False)]
     # Filter the dataframe such that 'model' contains any of the regex patterns
     data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
 
@@ -143,9 +138,9 @@ def regex_table(dataframe, regex, filter_button, style=True):
     # replace column '' with count/rank
     data.insert(0, '', range(1, 1 + len(data)))
 
-    if style:
-        # apply color
-        data = color_model_type_column(data, color_map)
+    # if style:
+    #     # apply color
+    #     data = color_model_type_column(data, color_map)
 
     return data
 
@@ -164,27 +159,6 @@ def get_leaderboard_results(results_path):
     df.reset_index(drop=True, inplace=True)
     return df
 
-def avg_all_subset(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, subset_counts=SUBSET_COUNTS):
-    new_df = orig_df.copy()[meta_data + columns_name]
-
-    # Filter the dictionary to include only the counts relevant to the specified columns
-    new_subset_counts = {col: subset_counts[col] for col in columns_name}
-
-    # Calculate the weights for each subset
-    total_count = sum(new_subset_counts.values())
-    weights = {subset: count / total_count for subset, count in new_subset_counts.items()}
-
-    # Calculate the weight_avg value for each row
-    def calculate_weighted_avg(row):
-        weighted_sum = sum(row[col] * weights[col] for col in columns_name)
-        return weighted_sum
-
-    new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1)
-
-    cols = meta_data + ["Overall Score"] + columns_name
-    new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
-    return new_df
-
 
 def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, perspective_counts=PERSPECTIVE_COUNTS):
     new_df = orig_df[meta_data + columns_name]
@@ -200,28 +174,63 @@ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=MET
     new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)
     return new_df
 
-results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")
-orig_df = get_leaderboard_results(results_path)
-colmuns_name = list(SUBSET_COUNTS.keys())
-detailed_df = avg_all_subset(orig_df, colmuns_name).round(2)
-
-results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/overall-results")
-orig_df = get_leaderboard_results(results_path)
-colmuns_name = list(PERSPECTIVE_COUNTS.keys())
-perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2)
-
-total_models = len(detailed_df)
+data = {
+    "Model": [
+        "MiniGPT-5", "EMU-2", "GILL", "Anole",
+        "GPT-4o - Openjourney", "GPT-4o - SD-3", "GPT-4o - SD-XL", "GPT-4o - Flux",
+        "Gemini-1.5 - Openjourney", "Gemini-1.5 - SD-3", "Gemini-1.5 - SD-XL", "Gemini-1.5 - Flux",
+        "LLAVA-34b - Openjourney", "LLAVA-34b - SD-3", "LLAVA-34b - SD-XL", "LLAVA-34b - Flux",
+        "Qwen-VL-70b - Openjourney", "Qwen-VL-70b - SD-3", "Qwen-VL-70b - SD-XL", "Qwen-VL-70b - Flux"
+    ],
+    "Model Type":[
+        "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+        "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM",
+    ],
+    "Situational analysis": [
+        47.63, 39.65, 46.72, 48.95,
+        53.05, 53.00, 56.12, 54.97,
+        48.08, 47.48, 49.43, 47.07,
+        54.12, 54.72, 55.97, 54.23,
+        52.73, 54.98, 52.58, 54.23
+    ],
+    "Project-based learning": [
+        55.12, 46.12, 57.57, 59.05,
+        71.40, 71.20, 73.25, 68.80,
+        67.93, 68.70, 71.85, 68.33,
+        73.47, 72.55, 74.60, 71.32,
+        71.63, 71.87, 73.57, 69.47
+    ],
+    "Multi-step reasoning": [
+        42.17, 50.75, 39.33, 51.72,
+        53.67, 53.67, 53.67, 53.67,
+        60.05, 60.05, 60.05, 60.05,
+        47.28, 47.28, 47.28, 47.28,
+        55.63, 55.63, 55.63, 55.63
+    ],
+    "AVG": [
+        50.92, 45.33, 51.58, 55.22,
+        63.65, 63.52, 65.47, 62.63,
+        61.57, 61.87, 64.15, 61.55,
+        63.93, 63.57, 65.05, 62.73,
+        64.05, 64.75, 65.12, 63.18
+    ]
+}
+df = pd.DataFrame(data)
+total_models = len(df)
 with gr.Blocks(css=custom_css) as app:
     with gr.Row():
         with gr.Column(scale=6):
             gr.Markdown(INTRODUCTION_TEXT.format(str(total_models)))
         with gr.Column(scale=4):
-            gr.Markdown("![](https://huggingface.co/spaces/
+            gr.Markdown("![](https://huggingface.co/spaces/MMIE/Leaderboard/resolve/main/src/logo.png)")
             # gr.HTML(BGB_LOGO, elem_classes="logo")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏆
+        with gr.TabItem("🏆 MMIE Leaderboard"):
             with gr.Row():
                 search_overall = gr.Textbox(
                     label="Model Search (delimit with , )",
@@ -229,88 +238,46 @@
                     show_label=False
                 )
                 model_type_overall = gr.CheckboxGroup(
-                    choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-                    value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-                    label="Model Types",
+                    choices=["Interleaved LVLM", "Integrated LVLM"],
+                    value=["Interleaved LVLM", "Integrated LVLM"],
+                    label="Model Type",
                     show_label=False,
                     interactive=True,
                 )
             with gr.Row():
-                mjbench_table_overall_hidden = gr.Dataframe(
-                    perspective_df,
-                    headers=perspective_df.columns.tolist(),
-                    elem_id="mjbench_leadboard_overall_hidden",
+                mmie_table_overall_hidden = gr.Dataframe(
+                    df,
+                    headers=df.columns.tolist(),
+                    elem_id="mmie_leadboard_overall_hidden",
                     wrap=True,
                     visible=False,
                 )
-                mjbench_table_overall = gr.Dataframe(
+                mmie_table_overall = gr.Dataframe(
                     regex_table(
-                        perspective_df.copy(),
+                        df.copy(),
                         "",
-                        ["Score Model", "Opensource VLM", "Closesource VLM", "Others"]
+                        ["Interleaved LVLM", "Integrated LVLM"]
                     ),
-                    headers=perspective_df.columns.tolist(),
-                    elem_id="mjbench_leadboard_overall",
+                    headers=df.columns.tolist(),
+                    elem_id="mmie_leadboard_overall",
                     wrap=True,
                     height=1000,
                 )
-        # with gr.TabItem("🔍 MJ-Bench Detailed Results"):
-        #     with gr.Row():
-        #         search_detail = gr.Textbox(
-        #             label="Model Search (delimit with , )",
-        #             placeholder="🔍 Search model (separate multiple queries with ``) and press ENTER...",
-        #             show_label=False
-        #         )
-        #         model_type_detail = gr.CheckboxGroup(
-        #             choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-        #             value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"],
-        #             label="Model Types",
-        #             show_label=False,
-        #             interactive=True,
-        #         )
-        #     with gr.Row():
-        #         mjbench_table_detail_hidden = gr.Dataframe(
-        #             detailed_df,
-        #             headers=detailed_df.columns.tolist(),
-        #             elem_id="mjbench_detailed_hidden",
-        #             # column_widths = ["500px", "500px"],
-        #             wrap=True,
-        #             visible=False,
-        #         )
-        #         mjbench_table_detail = gr.Dataframe(
-        #             regex_table(
-        #                 detailed_df.copy(),
-        #                 "",
-        #                 ["Score Model", "Opensource VLM", "Closesource VLM", "Others"]
-        #             ),
-        #             headers=detailed_df.columns.tolist(),
-        #             elem_id="mjbench_detailed",
-        #             column_widths = ["40px", "200px", "180px", "130px", "150px"] + ["130px"]*50,
-        #             wrap=True,
-        #             height=1000,
-        #         )
         with gr.TabItem("About"):
             with gr.Row():
                 gr.Markdown(ABOUT_TEXT)
 
     with gr.Accordion("📚 Citation", open=False):
         citation_button = gr.Textbox(
-            value=r"""
-            title={MJ-BENCH: Is Your Multimodal Reward Model Really a Good Judge?},
-            author={Chen*, Zhaorun and Du*, Yichao and Wen, Zichen and Zhou, Yiyang and Cui, Chenhang and Weng, Zhenzhen and Tu, Haoqin and Wang, Chaoqi and Tong, Zhengwei and HUANG, Leria and Chen, Canyu and Ye Qinghao and Zhu, Zhihong and Zhang, Yuqing and Zhou, Jiawei and Zhao, Zhuokai and Rafailov, Rafael and Finn, Chelsea and Yao, Huaxiu},
-            year={2024}
-            }""",
+            value=r"""""",
            lines=7,
             label="Copy the following to cite these results.",
             elem_id="citation-button",
             show_copy_button=True,
         )
 
-    search_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall)
-    model_type_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall)
-
-    # search_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail)
-    # model_type_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail)
+    search_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
+    model_type_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall)
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=18000) # restarted every 3h
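A note on the scoring logic this commit touches: the deleted avg_all_subset (and the retained avg_all_perspective, whose body the diff view truncates) computes a count-weighted overall score. Below is a minimal runnable sketch mirroring the deleted function; the subset names and counts are illustrative stand-ins, not the Space's actual SUBSET_COUNTS or PERSPECTIVE_COUNTS:

import pandas as pd

META_DATA = ["Model"]  # stand-in for the commit's META_DATA
SUBSET_COUNTS = {"Situational analysis": 100, "Project-based learning": 80}  # made-up counts

def avg_all_subset(orig_df, columns_name, meta_data=META_DATA, subset_counts=SUBSET_COUNTS):
    new_df = orig_df[meta_data + columns_name].copy()
    # Each subset's weight is its share of the total example count.
    new_subset_counts = {col: subset_counts[col] for col in columns_name}
    total_count = sum(new_subset_counts.values())
    weights = {subset: count / total_count for subset, count in new_subset_counts.items()}
    new_df["Overall Score"] = new_df.apply(
        lambda row: sum(row[col] * weights[col] for col in columns_name), axis=1
    )
    cols = meta_data + ["Overall Score"] + columns_name
    return new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True)

scores = pd.DataFrame({
    "Model": ["A", "B"],
    "Situational analysis": [50.0, 60.0],
    "Project-based learning": [70.0, 40.0],
})
print(avg_all_subset(scores, ["Situational analysis", "Project-based learning"]))
# A: (50*100 + 70*80) / 180 ≈ 58.89;  B: (60*100 + 40*80) / 180 ≈ 51.11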
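As for how the UI drives filtering after this commit: the visible table is re-rendered from a hidden, unfiltered copy whenever the search box or the Model Type checkboxes change, with both passed through regex_table. Here is a small self-contained sketch of that behavior; the comma-to-OR regex construction is an assumption inferred from the "delimit with , " search label, since the line defining combined_regex is not shown in the diff:

import pandas as pd

# Two illustrative rows from the hardcoded MMIE table above.
df = pd.DataFrame({
    "Model": ["MiniGPT-5", "GPT-4o - SD-XL"],
    "Model Type": ["Interleaved LVLM", "Integrated LVLM"],
    "AVG": [50.92, 65.47],
})

def regex_table(dataframe, regex, filter_button):
    # Assumption: comma-delimited queries are OR-ed into a single regex.
    combined_regex = "|".join(q.strip() for q in regex.split(","))
    if isinstance(filter_button, (list, str)):
        # Drop every model type whose checkbox is unticked, as in the commit.
        for model_type in ("Integrated LVLM", "Interleaved LVLM"):
            if model_type not in filter_button:
                dataframe = dataframe[~dataframe["Model Type"].str.contains(model_type, case=False, na=False)]
    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)].copy()
    data.insert(0, "", range(1, 1 + len(data)))  # rank column
    return data

print(regex_table(df.copy(), "gpt", ["Integrated LVLM"]))  # -> only the GPT-4o row, ranked 1

Keeping a hidden gr.Dataframe of the full table and recomputing the visible one inside the .change() handlers is what lets the two controls compose: each event re-filters from the pristine dataframe rather than from the previously filtered view.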
evals/.gitattributes DELETED
@@ -1,55 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.lz4 filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-# Audio files - uncompressed
-*.pcm filter=lfs diff=lfs merge=lfs -text
-*.sam filter=lfs diff=lfs merge=lfs -text
-*.raw filter=lfs diff=lfs merge=lfs -text
-# Audio files - compressed
-*.aac filter=lfs diff=lfs merge=lfs -text
-*.flac filter=lfs diff=lfs merge=lfs -text
-*.mp3 filter=lfs diff=lfs merge=lfs -text
-*.ogg filter=lfs diff=lfs merge=lfs -text
-*.wav filter=lfs diff=lfs merge=lfs -text
-# Image files - uncompressed
-*.bmp filter=lfs diff=lfs merge=lfs -text
-*.gif filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text
-*.tiff filter=lfs diff=lfs merge=lfs -text
-# Image files - compressed
-*.jpg filter=lfs diff=lfs merge=lfs -text
-*.jpeg filter=lfs diff=lfs merge=lfs -text
-*.webp filter=lfs diff=lfs merge=lfs -text
evals/README.md DELETED
@@ -1,6 +0,0 @@
----
-# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
-# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
-{}
----
-# Coming Soon
evals/mjbench-results/detailed-results/AestheticsPredictor.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "AestheticsPredictor", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "LAION",
-   "Alignment-Object": 35.9, "Alignment-Attribute": 38.4, "Alignment-Action": 43.6, "Alignment-Location": 31.6, "Alignment-Count": 35.7, "Alignment-Avg": 34.8,
-   "Safety-Toxicity-Crime": 51.7, "Safety-Toxicity-Shocking": 58.6, "Safety-Toxicity-Disgust": 64.3, "Safety-Toxicity-Avg": 57.3,
-   "Safety-Nsfw-Evident": 14.6, "Safety-Nsfw-Evasive": 55.2, "Safety-Nsfw-Subtle": 14.2, "Safety-Nsfw-Avg": 37.5,
-   "Quality-Distortion-Human_face": 78.7, "Quality-Distortion-Human_limb": 57.1, "Quality-Distortion-Object": 51.3, "Quality-Distortion-Avg": 52.1,
-   "Quality-Blurry-Defocused": 90.1, "Quality-Blurry-Motion": 93.4, "Quality-Blurry-Avg": 91.6,
-   "Bias-Age": 59.4, "Bias-Gender": 62.0, "Bias-Race": 64.2, "Bias-Nationality": 62.4, "Bias-Religion": 61.0, "Bias-Avg": 62.0,
-   "Bias-Age-NDS": 85.3, "Bias-Gender-NDS": 85.9, "Bias-Race-NDS": 86.3, "Bias-Nationality-NDS": 85.8, "Bias-Religion-NDS": 86.2, "Bias-Avg-NDS": 85.9,
-   "Bias-Age-GES": 91.9, "Bias-Gender-GES": 92.1, "Bias-Race-GES": 92.4, "Bias-Nationality-GES": 92.1, "Bias-Religion-GES": 92.3, "Bias-Avg-GES": 92.1 }]
evals/mjbench-results/detailed-results/BLIP-v2.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "BLIP-v2", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "Salesforce",
-   "Alignment-Object": 23.5, "Alignment-Attribute": 22.7, "Alignment-Action": 24.8, "Alignment-Location": 19.7, "Alignment-Count": 16.1, "Alignment-Avg": 21.5,
-   "Safety-Toxicity-Crime": 6.9, "Safety-Toxicity-Shocking": 0.0, "Safety-Toxicity-Disgust": 4.8, "Safety-Toxicity-Avg": 4.5,
-   "Safety-Nsfw-Evident": 58.4, "Safety-Nsfw-Evasive": 51.1, "Safety-Nsfw-Subtle": 35.7, "Safety-Nsfw-Avg": 49.1,
-   "Quality-Distortion-Human_face": 3.6, "Quality-Distortion-Human_limb": 2.0, "Quality-Distortion-Object": 1.1, "Quality-Distortion-Avg": 1.9,
-   "Quality-Blurry-Defocused": 8.3, "Quality-Blurry-Motion": 47.2, "Quality-Blurry-Avg": 15.0,
-   "Bias-Age": 69.6, "Bias-Gender": 68.5, "Bias-Race": 65.9, "Bias-Nationality": 68.6, "Bias-Religion": 74.7, "Bias-Avg": 68.5,
-   "Bias-Age-NDS": 85.3, "Bias-Gender-NDS": 83.6, "Bias-Race-NDS": 82.7, "Bias-Nationality-NDS": 81.8, "Bias-Religion-NDS": 87.5, "Bias-Avg-NDS": 83.6,
-   "Bias-Age-GES": 92.2, "Bias-Gender-GES": 91.3, "Bias-Race-GES": 90.7, "Bias-Nationality-GES": 90.4, "Bias-Religion-GES": 93.1, "Bias-Avg-GES": 91.3 }]
evals/mjbench-results/detailed-results/CLIP-v2.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "CLIP-v2", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "LAION",
-   "Alignment-Object": 42.2, "Alignment-Attribute": 45.9, "Alignment-Action": 45.3, "Alignment-Location": 43.4, "Alignment-Count": 55.4, "Alignment-Avg": 44.0,
-   "Safety-Toxicity-Crime": 89.7, "Safety-Toxicity-Shocking": 96.6, "Safety-Toxicity-Disgust": 97.6, "Safety-Toxicity-Avg": 94.4,
-   "Safety-Nsfw-Evident": 20.8, "Safety-Nsfw-Evasive": 4.5, "Safety-Nsfw-Subtle": 16.6, "Safety-Nsfw-Avg": 7.9,
-   "Quality-Distortion-Human_face": 26.6, "Quality-Distortion-Human_limb": 17.2, "Quality-Distortion-Object": 34.0, "Quality-Distortion-Avg": 19.3,
-   "Quality-Blurry-Defocused": 50.6, "Quality-Blurry-Motion": 63.7, "Quality-Blurry-Avg": 56.7,
-   "Bias-Age": 57.2, "Bias-Gender": 57.8, "Bias-Race": 55.5, "Bias-Nationality": 59.5, "Bias-Religion": 60.8, "Bias-Avg": 57.7,
-   "Bias-Age-NDS": 73.6, "Bias-Gender-NDS": 75.2, "Bias-Race-NDS": 73.1, "Bias-Nationality-NDS": 79.1, "Bias-Religion-NDS": 78.4, "Bias-Avg-NDS": 75.2,
-   "Bias-Age-GES": 73.6, "Bias-Gender-GES": 75.2, "Bias-Race-GES": 73.1, "Bias-Nationality-GES": 79.1, "Bias-Religion-GES": 78.4, "Bias-Avg-GES": 75.2 }]
evals/mjbench-results/detailed-results/Claude 3 Opus.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "Claude 3 Opus", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "Anthropic",
-   "Alignment-Object": 64.9, "Alignment-Attribute": 38.9, "Alignment-Action": 44.4, "Alignment-Location": 55.3, "Alignment-Count": 55.4, "Alignment-Avg": 57.1,
-   "Safety-Toxicity-Crime": 62.1, "Safety-Toxicity-Shocking": 37.9, "Safety-Toxicity-Disgust": 50.0, "Safety-Toxicity-Avg": 50.6,
-   "Safety-Nsfw-Evident": 10.5, "Safety-Nsfw-Evasive": 6.2, "Safety-Nsfw-Subtle": 3.6, "Safety-Nsfw-Avg": 8.3,
-   "Quality-Distortion-Human_face": 26.6, "Quality-Distortion-Human_limb": 19.3, "Quality-Distortion-Object": 10.7, "Quality-Distortion-Avg": 17.6,
-   "Quality-Blurry-Defocused": 89.6, "Quality-Blurry-Motion": 93.3, "Quality-Blurry-Avg": 92.7,
-   "Bias-Age": 53.9, "Bias-Gender": 58.2, "Bias-Race": 62.1, "Bias-Nationality": 59.0, "Bias-Religion": 54.0, "Bias-Avg": 58.2,
-   "Bias-Age-NDS": 63.3, "Bias-Gender-NDS": 66.1, "Bias-Race-NDS": 67.5, "Bias-Nationality-NDS": 66.9, "Bias-Religion-NDS": 66.8, "Bias-Avg-NDS": 66.1,
-   "Bias-Age-GES": 83.2, "Bias-Gender-GES": 85.2, "Bias-Race-GES": 86.5, "Bias-Nationality-GES": 85.8, "Bias-Religion-GES": 84.8, "Bias-Avg-GES": 85.2 }]
evals/mjbench-results/detailed-results/GPT-4-vision.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "GPT-4-vision", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "OpenAI",
-   "Alignment-Object": 68.1, "Alignment-Attribute": 62.9, "Alignment-Action": 64.1, "Alignment-Location": 67.1, "Alignment-Count": 73.2, "Alignment-Avg": 66.1,
-   "Safety-Toxicity-Crime": 75.9, "Safety-Toxicity-Shocking": 69.0, "Safety-Toxicity-Disgust": 81.0, "Safety-Toxicity-Avg": 76.4,
-   "Safety-Nsfw-Evident": 69.5, "Safety-Nsfw-Evasive": 43.2, "Safety-Nsfw-Subtle": 32.5, "Safety-Nsfw-Avg": 44.1,
-   "Quality-Distortion-Human_face": 87.6, "Quality-Distortion-Human_limb": 57.6, "Quality-Distortion-Object": 83.1, "Quality-Distortion-Avg": 75.7,
-   "Quality-Blurry-Defocused": 98.8, "Quality-Blurry-Motion": 99.3, "Quality-Blurry-Avg": 99.2,
-   "Bias-Age": 76.7, "Bias-Gender": 79.1, "Bias-Race": 77.4, "Bias-Nationality": 81.0, "Bias-Religion": 86.5, "Bias-Avg": 79.1,
-   "Bias-Age-NDS": 81.2, "Bias-Gender-NDS": 80.2, "Bias-Race-NDS": 77.6, "Bias-Nationality-NDS": 79.9, "Bias-Religion-NDS": 88.2, "Bias-Avg-NDS": 80.2,
-   "Bias-Age-GES": 93.0, "Bias-Gender-GES": 93.2, "Bias-Race-GES": 92.2, "Bias-Nationality-GES": 93.4, "Bias-Religion-GES": 96.4, "Bias-Avg-GES": 93.2 }]
evals/mjbench-results/detailed-results/GPT-4o.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "GPT-4o", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "OpenAI",
-   "Alignment-Object": 62.2, "Alignment-Attribute": 57.2, "Alignment-Action": 64.1, "Alignment-Location": 63.2, "Alignment-Count": 67.9, "Alignment-Avg": 61.5,
-   "Safety-Toxicity-Crime": 86.2, "Safety-Toxicity-Shocking": 96.6, "Safety-Toxicity-Disgust": 95.2, "Safety-Toxicity-Avg": 92.1,
-   "Safety-Nsfw-Evident": 72.3, "Safety-Nsfw-Evasive": 51.7, "Safety-Nsfw-Subtle": 38.9, "Safety-Nsfw-Avg": 54.3,
-   "Quality-Distortion-Human_face": 99.4, "Quality-Distortion-Human_limb": 78.2, "Quality-Distortion-Object": 100.0, "Quality-Distortion-Avg": 93.8,
-   "Quality-Blurry-Defocused": 100.0, "Quality-Blurry-Motion": 100.0, "Quality-Blurry-Avg": 100.0,
-   "Bias-Age": 60.9, "Bias-Gender": 66.6, "Bias-Race": 69.1, "Bias-Nationality": 68.2, "Bias-Religion": 69.6, "Bias-Avg": 66.6,
-   "Bias-Age-NDS": 81.2, "Bias-Gender-NDS": 82.7, "Bias-Race-NDS": 82.8, "Bias-Nationality-NDS": 83.2, "Bias-Religion-NDS": 86.1, "Bias-Avg-NDS": 82.7,
-   "Bias-Age-GES": 91.8, "Bias-Gender-GES": 92.9, "Bias-Race-GES": 93.1, "Bias-Nationality-GES": 93.3, "Bias-Religion-GES": 94.4, "Bias-Avg-GES": 92.9 }]
evals/mjbench-results/detailed-results/Gemini Ultra.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "Gemini Ultra", "Model Type": "Closesource VLM", "Input Type": "Multi Image", "Organization": "Google",
-   "Alignment-Object": 71.7, "Alignment-Attribute": 65.1, "Alignment-Action": 63.2, "Alignment-Location": 64.5, "Alignment-Count": 67.8, "Alignment-Avg": 67.2,
-   "Safety-Toxicity-Crime": 65.5, "Safety-Toxicity-Shocking": 41.4, "Safety-Toxicity-Disgust": 78.6, "Safety-Toxicity-Avg": 64.0,
-   "Safety-Nsfw-Evident": 31.6, "Safety-Nsfw-Evasive": 19.1, "Safety-Nsfw-Subtle": 10.3, "Safety-Nsfw-Avg": 22.7,
-   "Quality-Distortion-Human_face": 73.4, "Quality-Distortion-Human_limb": 32.5, "Quality-Distortion-Object": 61.0, "Quality-Distortion-Avg": 55.7,
-   "Quality-Blurry-Defocused": 86.5, "Quality-Blurry-Motion": 97.3, "Quality-Blurry-Avg": 93.9,
-   "Bias-Age": 48.7, "Bias-Gender": 56.9, "Bias-Race": 62.9, "Bias-Nationality": 60.0, "Bias-Religion": 49.9, "Bias-Avg": 56.9,
-   "Bias-Age-NDS": 72.6, "Bias-Gender-NDS": 75.8, "Bias-Race-NDS": 78.4, "Bias-Nationality-NDS": 77.0, "Bias-Religion-NDS": 72.3, "Bias-Avg-NDS": 75.8,
-   "Bias-Age-GES": 86.6, "Bias-Gender-GES": 89.0, "Bias-Race-GES": 90.8, "Bias-Nationality-GES": 90.0, "Bias-Religion-GES": 86.2, "Bias-Avg-GES": 89.0 }]
evals/mjbench-results/detailed-results/HPS-v2.1.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "HPS-v2.1", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "CUHK MMLab",
-   "Alignment-Object": 49.4, "Alignment-Attribute": 53.7, "Alignment-Action": 49.6, "Alignment-Location": 51.3, "Alignment-Count": 57.1, "Alignment-Avg": 48.8,
-   "Safety-Toxicity-Crime": 89.7, "Safety-Toxicity-Shocking": 86.2, "Safety-Toxicity-Disgust": 85.7, "Safety-Toxicity-Avg": 87.6,
-   "Safety-Nsfw-Evident": 1.1, "Safety-Nsfw-Evasive": 30.8, "Safety-Nsfw-Subtle": 0.6, "Safety-Nsfw-Avg": 15.1,
-   "Quality-Distortion-Human_face": 60.4, "Quality-Distortion-Human_limb": 37.1, "Quality-Distortion-Object": 80.3, "Quality-Distortion-Avg": 51.7,
-   "Quality-Blurry-Defocused": 85.7, "Quality-Blurry-Motion": 94.6, "Quality-Blurry-Avg": 88.6,
-   "Bias-Age": 52.9, "Bias-Gender": 55.3, "Bias-Race": 55.7, "Bias-Nationality": 55.0, "Bias-Religion": 62.4, "Bias-Avg": 55.3,
-   "Bias-Age-NDS": 75.8, "Bias-Gender-NDS": 78.2, "Bias-Race-NDS": 79.5, "Bias-Nationality-NDS": 78.6, "Bias-Religion-NDS": 79.3, "Bias-Avg-NDS": 78.2,
-   "Bias-Age-GES": 86.4, "Bias-Gender-GES": 87.8, "Bias-Race-GES": 88.5, "Bias-Nationality-GES": 88.0, "Bias-Religion-GES": 88.5, "Bias-Avg-GES": 87.8 }]
evals/mjbench-results/detailed-results/Idefics2-8b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "Idefics2-8b", "Model Type": "Opensource VLM", "Input Type": "Multi Image", "Organization": "HuggingFace",
-   "Alignment-Object": 35.5, "Alignment-Attribute": 31.7, "Alignment-Action": 30.8, "Alignment-Location": 29.9, "Alignment-Count": 30.4, "Alignment-Avg": 32.6,
-   "Safety-Toxicity-Crime": 58.6, "Safety-Toxicity-Shocking": 44.8, "Safety-Toxicity-Disgust": 57.1, "Safety-Toxicity-Avg": 52.8,
-   "Safety-Nsfw-Evident": 32.9, "Safety-Nsfw-Evasive": 13.2, "Safety-Nsfw-Subtle": 19.5, "Safety-Nsfw-Avg": 20.2,
-   "Quality-Distortion-Human_face": 29.6, "Quality-Distortion-Human_limb": 25.8, "Quality-Distortion-Object": 2.3, "Quality-Distortion-Avg": 21.7,
-   "Quality-Blurry-Defocused": 70.6, "Quality-Blurry-Motion": 46.9, "Quality-Blurry-Avg": 58.7,
-   "Bias-Age": 37.4, "Bias-Gender": 42.7, "Bias-Race": 45.3, "Bias-Nationality": 46.9, "Bias-Religion": 35.2, "Bias-Avg": 42.7,
-   "Bias-Age-NDS": 55.1, "Bias-Gender-NDS": 59.2, "Bias-Race-NDS": 61.7, "Bias-Nationality-NDS": 62.8, "Bias-Religion-NDS": 51.0, "Bias-Avg-NDS": 59.2,
-   "Bias-Age-GES": 77.0, "Bias-Gender-GES": 79.7, "Bias-Race-GES": 81.3, "Bias-Nationality-GES": 82.0, "Bias-Religion-GES": 74.4, "Bias-Avg-GES": 79.8 }]
evals/mjbench-results/detailed-results/ImageReward.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "ImageReward", "Model Type": "Score Model", "Input Type": "Single Image", "Organization": "THUDM",
-   "Alignment-Object": 50.6, "Alignment-Attribute": 52.8, "Alignment-Action": 47.1, "Alignment-Location": 57.9, "Alignment-Count": 53.6, "Alignment-Avg": 51.1,
-   "Safety-Toxicity-Crime": 96.6, "Safety-Toxicity-Shocking": 96.6, "Safety-Toxicity-Disgust": 95.2, "Safety-Toxicity-Avg": 95.5,
-   "Safety-Nsfw-Evident": 31.1, "Safety-Nsfw-Evasive": 10.2, "Safety-Nsfw-Subtle": 27.4, "Safety-Nsfw-Avg": 18.2,
-   "Quality-Distortion-Human_face": 31.4, "Quality-Distortion-Human_limb": 34.4, "Quality-Distortion-Object": 40.2, "Quality-Distortion-Avg": 33.3,
-   "Quality-Blurry-Defocused": 77.4, "Quality-Blurry-Motion": 86.6, "Quality-Blurry-Avg": 82.1,
-   "Bias-Age": 41.8, "Bias-Gender": 40.4, "Bias-Race": 36.8, "Bias-Nationality": 39.5, "Bias-Religion": 52.8, "Bias-Avg": 40.4,
-   "Bias-Age-NDS": 73.9, "Bias-Gender-NDS": 73.2, "Bias-Race-NDS": 70.9, "Bias-Nationality-NDS": 73.0, "Bias-Religion-NDS": 80.2, "Bias-Avg-NDS": 73.2,
-   "Bias-Age-GES": 85.5, "Bias-Gender-GES": 85.0, "Bias-Race-GES": 83.6, "Bias-Nationality-GES": 84.8, "Bias-Religion-GES": 89.0, "Bias-Avg-GES": 85.0 }]
evals/mjbench-results/detailed-results/Instructblip-7b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "Instructblip-7b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "Salesforce",
-   "Alignment-Object": 17.1, "Alignment-Attribute": 17.4, "Alignment-Action": 16.2, "Alignment-Location": 13.1, "Alignment-Count": 21.4, "Alignment-Avg": 17.1,
-   "Safety-Toxicity-Crime": 31.0, "Safety-Toxicity-Shocking": 34.5, "Safety-Toxicity-Disgust": 40.5, "Safety-Toxicity-Avg": 39.3,
-   "Safety-Nsfw-Evident": 36.9, "Safety-Nsfw-Evasive": 24.2, "Safety-Nsfw-Subtle": 30.6, "Safety-Nsfw-Avg": 33.7,
-   "Quality-Distortion-Human_face": 12.4, "Quality-Distortion-Human_limb": 9.3, "Quality-Distortion-Object": 21.0, "Quality-Distortion-Avg": 13.3,
-   "Quality-Blurry-Defocused": 32.3, "Quality-Blurry-Motion": 31.1, "Quality-Blurry-Avg": 31.7,
-   "Bias-Age": 52.5, "Bias-Gender": 53.6, "Bias-Race": 53.6, "Bias-Nationality": 52.0, "Bias-Religion": 61.1, "Bias-Avg": 53.6,
-   "Bias-Age-NDS": 80.8, "Bias-Gender-NDS": 80.6, "Bias-Race-NDS": 80.3, "Bias-Nationality-NDS": 79.0, "Bias-Religion-NDS": 85.4, "Bias-Avg-NDS": 80.6,
-   "Bias-Age-GES": 91.0, "Bias-Gender-GES": 91.2, "Bias-Race-GES": 91.1, "Bias-Nationality-GES": 90.4, "Bias-Religion-GES": 93.8, "Bias-Avg-GES": 91.1 }]
evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "InternVL-Chat-V1-5", "Model Type": "Opensource VLM", "Input Type": "Multi Image", "Organization": "OpenGVLab",
-   "Alignment-Object": 73.3, "Alignment-Attribute": 74.8, "Alignment-Action": 78.6, "Alignment-Location": 80.5, "Alignment-Count": 78.6, "Alignment-Avg": 75.8,
-   "Safety-Toxicity-Crime": 34.5, "Safety-Toxicity-Shocking": 10.3, "Safety-Toxicity-Disgust": 28.6, "Safety-Toxicity-Avg": 25.8,
-   "Safety-Nsfw-Evident": 23.3, "Safety-Nsfw-Evasive": 10.6, "Safety-Nsfw-Subtle": 7.2, "Safety-Nsfw-Avg": 16.2,
-   "Quality-Distortion-Human_face": 97.0, "Quality-Distortion-Human_limb": 95.4, "Quality-Distortion-Object": 97.1, "Quality-Distortion-Avg": 97.1,
-   "Quality-Blurry-Defocused": 89.7, "Quality-Blurry-Motion": 89.7, "Quality-Blurry-Avg": 89.7,
-   "Bias-Age": 40.0, "Bias-Gender": 41.3, "Bias-Race": 42.1, "Bias-Nationality": 42.0, "Bias-Religion": 39.8, "Bias-Avg": 41.3,
-   "Bias-Age-NDS": 74.0, "Bias-Gender-NDS": 74.1, "Bias-Race-NDS": 73.6, "Bias-Nationality-NDS": 73.9, "Bias-Religion-NDS": 76.6, "Bias-Avg-NDS": 74.1,
-   "Bias-Age-GES": 86.9, "Bias-Gender-GES": 87.2, "Bias-Race-GES": 87.1, "Bias-Nationality-GES": 87.3, "Bias-Religion-GES": 88.0, "Bias-Avg-GES": 87.2 }]
evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "LLaVA-1.5-13b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "UW-Madison & Microsoft",
-   "Alignment-Object": 17.7, "Alignment-Attribute": 13.5, "Alignment-Action": 11.8, "Alignment-Location": 16.5, "Alignment-Count": 8.9, "Alignment-Avg": 10.3,
-   "Safety-Toxicity-Crime": 31.0, "Safety-Toxicity-Shocking": 31.0, "Safety-Toxicity-Disgust": 40.5, "Safety-Toxicity-Avg": 33.7,
-   "Safety-Nsfw-Evident": 40.8, "Safety-Nsfw-Evasive": 29.9, "Safety-Nsfw-Subtle": 33.6, "Safety-Nsfw-Avg": 34.7,
-   "Quality-Distortion-Human_face": 20.1, "Quality-Distortion-Human_limb": 14.6, "Quality-Distortion-Object": 13.3, "Quality-Distortion-Avg": 16.4,
-   "Quality-Blurry-Defocused": 18.0, "Quality-Blurry-Motion": 34.0, "Quality-Blurry-Avg": 26.1,
-   "Bias-Age": 67.0, "Bias-Gender": 70.1, "Bias-Race": 68.9, "Bias-Nationality": 72.7, "Bias-Religion": 75.1, "Bias-Avg": 70.1,
-   "Bias-Age-NDS": 71.9, "Bias-Gender-NDS": 74.8, "Bias-Race-NDS": 76.6, "Bias-Nationality-NDS": 74.0, "Bias-Religion-NDS": 80.6, "Bias-Avg-NDS": 74.8,
-   "Bias-Age-GES": 87.5, "Bias-Gender-GES": 88.8, "Bias-Race-GES": 88.9, "Bias-Nationality-GES": 89.5, "Bias-Religion-GES": 90.1, "Bias-Avg-GES": 88.8 }]
evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "LLaVA-1.5-7b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "UW-Madison & Microsoft",
-   "Alignment-Object": 20.7, "Alignment-Attribute": 25.2, "Alignment-Action": 23.1, "Alignment-Location": 18.2, "Alignment-Count": 17.9, "Alignment-Avg": 22.0,
-   "Safety-Toxicity-Crime": 44.8, "Safety-Toxicity-Shocking": 41.4, "Safety-Toxicity-Disgust": 47.6, "Safety-Toxicity-Avg": 43.8,
-   "Safety-Nsfw-Evident": 35.7, "Safety-Nsfw-Evasive": 21.2, "Safety-Nsfw-Subtle": 17.6, "Safety-Nsfw-Avg": 26.3,
-   "Quality-Distortion-Human_face": 13.6, "Quality-Distortion-Human_limb": 7.3, "Quality-Distortion-Object": 9.2, "Quality-Distortion-Avg": 10.2,
-   "Quality-Blurry-Defocused": 7.1, "Quality-Blurry-Motion": 19.1, "Quality-Blurry-Avg": 13.1,
-   "Bias-Age": 80.8, "Bias-Gender": 83.9, "Bias-Race": 84.6, "Bias-Nationality": 84.9, "Bias-Religion": 88.1, "Bias-Avg": 84.0,
-   "Bias-Age-NDS": 67.6, "Bias-Gender-NDS": 71.4, "Bias-Race-NDS": 75.8, "Bias-Nationality-NDS": 68.4, "Bias-Religion-NDS": 77.3, "Bias-Avg-NDS": 71.4,
-   "Bias-Age-GES": 87.4, "Bias-Gender-GES": 88.9, "Bias-Race-GES": 90.1, "Bias-Nationality-GES": 88.7, "Bias-Religion-GES": 90.7, "Bias-Avg-GES": 88.9 }]
evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "LLaVA-NeXT-mistral-7b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "UW-Madison & ByteDance",
-   "Alignment-Object": 25.9, "Alignment-Attribute": 30.0, "Alignment-Action": 41.9, "Alignment-Location": 33.8, "Alignment-Count": 35.7, "Alignment-Avg": 31.3,
-   "Safety-Toxicity-Crime": 20.7, "Safety-Toxicity-Shocking": 24.1, "Safety-Toxicity-Disgust": 19.0, "Safety-Toxicity-Avg": 21.3,
-   "Safety-Nsfw-Evident": 35.7, "Safety-Nsfw-Evasive": 14.1, "Safety-Nsfw-Subtle": 23.3, "Safety-Nsfw-Avg": 25.6,
-   "Quality-Distortion-Human_face": 28.4, "Quality-Distortion-Human_limb": 27.8, "Quality-Distortion-Object": 19.0, "Quality-Distortion-Avg": 30.1,
-   "Quality-Blurry-Defocused": 41.7, "Quality-Blurry-Motion": 66.1, "Quality-Blurry-Avg": 53.9,
-   "Bias-Age": 54.3, "Bias-Gender": 56.7, "Bias-Race": 57.0, "Bias-Nationality": 56.1, "Bias-Religion": 64.8, "Bias-Avg": 56.6,
-   "Bias-Age-NDS": 63.2, "Bias-Gender-NDS": 64.1, "Bias-Race-NDS": 62.5, "Bias-Nationality-NDS": 63.8, "Bias-Religion-NDS": 74.2, "Bias-Avg-NDS": 64.1,
-   "Bias-Age-GES": 82.1, "Bias-Gender-GES": 82.8, "Bias-Race-GES": 82.4, "Bias-Nationality-GES": 82.5, "Bias-Religion-GES": 87.8, "Bias-Avg-GES": 82.8 }]
evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json DELETED
@@ -1,35 +0,0 @@
-[{ "Model": "LLaVA-NeXT-vicuna-13b", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "UW-Madison & ByteDance",
-   "Alignment-Object": 25.9, "Alignment-Attribute": 27.4, "Alignment-Action": 31.6, "Alignment-Location": 38.9, "Alignment-Count": 32.1, "Alignment-Avg": 29.1,
-   "Safety-Toxicity-Crime": 44.8, "Safety-Toxicity-Shocking": 37.9, "Safety-Toxicity-Disgust": 52.4, "Safety-Toxicity-Avg": 43.8,
-   "Safety-Nsfw-Evident": 40.9, "Safety-Nsfw-Evasive": 25.1, "Safety-Nsfw-Subtle": 27.8, "Safety-Nsfw-Avg": 36.5,
-   "Quality-Distortion-Human_face": 18.9, "Quality-Distortion-Human_limb": 27.8, "Quality-Distortion-Object": 12.0, "Quality-Distortion-Avg": 20.5,
-   "Quality-Blurry-Defocused": 40.6, "Quality-Blurry-Motion": 45.4, "Quality-Blurry-Avg": 43.0,
-   "Bias-Age": 54.3, "Bias-Gender": 56.7, "Bias-Race": 57.0, "Bias-Nationality": 56.1, "Bias-Religion": 64.8, "Bias-Avg": 56.6 }]
evals/mjbench-results/detailed-results/MiniGPT4-v2.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "MiniGPT4-v2", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "Vision-CAIR",
-   "Alignment-Object": 37.5, "Alignment-Attribute": 30.9, "Alignment-Action": 30.8, "Alignment-Location": 32.5, "Alignment-Count": 39.3, "Alignment-Avg": 32.8,
-   "Safety-Toxicity-Crime": 41.4, "Safety-Toxicity-Shocking": 62.1, "Safety-Toxicity-Disgust": 42.9, "Safety-Toxicity-Avg": 48.3,
-   "Safety-Nsfw-Evident": 39.6, "Safety-Nsfw-Evasive": 21.4, "Safety-Nsfw-Subtle": 36.5, "Safety-Nsfw-Avg": 32.6,
-   "Quality-Distortion-Human_face": 39.6, "Quality-Distortion-Human_limb": 39.1, "Quality-Distortion-Object": 42.0, "Quality-Distortion-Avg": 40.0,
-   "Quality-Blurry-Defocused": 33.4, "Quality-Blurry-Motion": 37.4, "Quality-Blurry-Avg": 35.4,
-   "Bias-Age": 31.8, "Bias-Gender": 32.2, "Bias-Race": 31.9, "Bias-Nationality": 34.1, "Bias-Religion": 28.3, "Bias-Avg": 32.2,
-   "Bias-Age-NDS": 68.1, "Bias-Gender-NDS": 67.2, "Bias-Race-NDS": 66.2, "Bias-Nationality-NDS": 67.0, "Bias-Religion-NDS": 69.3, "Bias-Avg-NDS": 67.2,
-   "Bias-Age-GES": 83.7, "Bias-Gender-GES": 83.3, "Bias-Race-GES": 82.8, "Bias-Nationality-GES": 83.4, "Bias-Religion-GES": 84.1, "Bias-Avg-GES": 83.3 }]
evals/mjbench-results/detailed-results/PickScore-v1.json DELETED
@@ -1,47 +0,0 @@
-[{ "Model": "PickScore-v1", "Model Type": "Opensource VLM", "Input Type": "Single Image", "Organization": "Stability AI",
-   "Alignment-Object": 60.9, "Alignment-Attribute": 60.3, "Alignment-Action": 62.4, "Alignment-Location": 59.2, "Alignment-Count": 67.9, "Alignment-Avg": 60.9,
-   "Safety-Toxicity-Crime": 89.7, "Safety-Toxicity-Shocking": 82.8, "Safety-Toxicity-Disgust": 88.1, "Safety-Toxicity-Avg": 86.5,
-   "Safety-Nsfw-Evident": 3.1, "Safety-Nsfw-Evasive": 48.2, "Safety-Nsfw-Subtle": 2.1, "Safety-Nsfw-Avg": 32.2,
-   "Quality-Distortion-Human_face": 83.4, "Quality-Distortion-Human_limb": 68.2, "Quality-Distortion-Object": 92.1, "Quality-Distortion-Avg": 79.3,
-   "Quality-Blurry-Defocused": 80.6, "Quality-Blurry-Motion": 93.4, "Quality-Blurry-Avg": 86.6,
-   "Bias-Age": 30.4, "Bias-Gender": 31.1, "Bias-Race": 30.8, "Bias-Nationality": 31.7, "Bias-Religion": 33.0, "Bias-Avg": 31.1,
-   "Bias-Age-NDS": 65.3, "Bias-Gender-NDS": 66.7, "Bias-Race-NDS": 66.4, "Bias-Nationality-NDS": 67.3, "Bias-Religion-NDS": 69.4, "Bias-Avg-NDS": 66.7,
-   "Bias-Age-GES": 80.5, "Bias-Gender-GES": 81.2, "Bias-Race-GES": 81.0, "Bias-Nationality-GES": 81.6, "Bias-Religion-GES": 82.6, "Bias-Avg-GES": 81.2 }]
evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment-Object": 14.3,
-        "Alignment-Attribute": 10.9,
-        "Alignment-Action": 9.4,
-        "Alignment-Location": 11.7,
-        "Alignment-Count": 16.1,
-        "Alignment-Avg": 11.8,
-        "Safety-Toxicity-Crime": 0.0,
-        "Safety-Toxicity-Shocking": 0.0,
-        "Safety-Toxicity-Disgust": 0.0,
-        "Safety-Toxicity-Avg": 0.0,
-        "Safety-Nsfw-Evident": 6.5,
-        "Safety-Nsfw-Evasive": 4.1,
-        "Safety-Nsfw-Subtle": 4.2,
-        "Safety-Nsfw-Avg": 5.3,
-        "Quality-Distortion-Human_face": 7.1,
-        "Quality-Distortion-Human_limb": 4.6,
-        "Quality-Distortion-Object": 7.2,
-        "Quality-Distortion-Avg": 6.2,
-        "Quality-Blurry-Defocused": 9.4,
-        "Quality-Blurry-Motion": 10.6,
-        "Quality-Blurry-Avg": 10.0,
-        "Bias-Age": 65.1,
-        "Bias-Gender": 65.8,
-        "Bias-Race": 63.4,
-        "Bias-Nationality": 65.7,
-        "Bias-Religion": 77.1,
-        "Bias-Avg": 65.8,
-        "Bias-Age-NDS": 54.2,
-        "Bias-Gender-NDS": 44.7,
-        "Bias-Race-NDS": 36.0,
-        "Bias-Nationality-NDS": 39.3,
-        "Bias-Religion-NDS": 65.7,
-        "Bias-Avg-NDS": 44.7,
-        "Bias-Age-GES": 79.2,
-        "Bias-Gender-GES": 76.0,
-        "Bias-Race-GES": 72.7,
-        "Bias-Nationality-GES": 74.1,
-        "Bias-Religion-GES": 85.1,
-        "Bias-Avg-GES": 76.0
-    }
-]
evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment-Object": 19.5,
-        "Alignment-Attribute": 15.2,
-        "Alignment-Action": 16.2,
-        "Alignment-Location": 22.1,
-        "Alignment-Count": 26.8,
-        "Alignment-Avg": 18.8,
-        "Safety-Toxicity-Crime": 0.0,
-        "Safety-Toxicity-Shocking": 0.0,
-        "Safety-Toxicity-Disgust": 0.0,
-        "Safety-Toxicity-Avg": 0.0,
-        "Safety-Nsfw-Evident": 10.3,
-        "Safety-Nsfw-Evasive": 6.8,
-        "Safety-Nsfw-Subtle": 4.3,
-        "Safety-Nsfw-Avg": 7.1,
-        "Quality-Distortion-Human_face": 16.6,
-        "Quality-Distortion-Human_limb": 17.9,
-        "Quality-Distortion-Object": 14.1,
-        "Quality-Distortion-Avg": 16.4,
-        "Quality-Blurry-Defocused": 22.3,
-        "Quality-Blurry-Motion": 30.3,
-        "Quality-Blurry-Avg": 26.3,
-        "Bias-Age": 43.8,
-        "Bias-Gender": 50.4,
-        "Bias-Race": 54.4,
-        "Bias-Nationality": 53.6,
-        "Bias-Religion": 44.9,
-        "Bias-Avg": 50.4,
-        "Bias-Age-NDS": 47.2,
-        "Bias-Gender-NDS": 42.5,
-        "Bias-Race-NDS": 37.8,
-        "Bias-Nationality-NDS": 40.0,
-        "Bias-Religion-NDS": 54.2,
-        "Bias-Avg-NDS": 42.5,
-        "Bias-Age-GES": 74.9,
-        "Bias-Gender-GES": 74.3,
-        "Bias-Race-GES": 73.1,
-        "Bias-Nationality-GES": 74.2,
-        "Bias-Religion-GES": 77.3,
-        "Bias-Avg-GES": 74.3
-    }
-]
evals/mjbench-results/detailed-results/Qwen-VL-Chat.json DELETED
@@ -1,47 +0,0 @@
-[
-    {
-        "Model": "Qwen-VL-Chat",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Alibaba",
-        "Alignment-Object": 30.7,
-        "Alignment-Attribute": 29.1,
-        "Alignment-Action": 35.9,
-        "Alignment-Location": 29.9,
-        "Alignment-Count": 32.1,
-        "Alignment-Avg": 31.1,
-        "Safety-Toxicity-Crime": 27.6,
-        "Safety-Toxicity-Shocking": 13.8,
-        "Safety-Toxicity-Disgust": 31.0,
-        "Safety-Toxicity-Avg": 24.7,
-        "Safety-Nsfw-Evident": 18.9,
-        "Safety-Nsfw-Evasive": 7.6,
-        "Safety-Nsfw-Subtle": 6.3,
-        "Safety-Nsfw-Avg": 11.6,
-        "Quality-Distortion-Human_face": 14.2,
-        "Quality-Distortion-Human_limb": 15.9,
-        "Quality-Distortion-Object": 9.4,
-        "Quality-Distortion-Avg": 13.6,
-        "Quality-Blurry-Defocused": 0.9,
-        "Quality-Blurry-Motion": 2.1,
-        "Quality-Blurry-Avg": 1.4,
-        "Bias-Age": 70.8,
-        "Bias-Gender": 71.5,
-        "Bias-Race": 72.3,
-        "Bias-Nationality": 72.2,
-        "Bias-Religion": 68.1,
-        "Bias-Avg": 71.5,
-        "Bias-Age-NDS": 62.4,
-        "Bias-Gender-NDS": 62.3,
-        "Bias-Race-NDS": 62.3,
-        "Bias-Nationality-NDS": 63.1,
-        "Bias-Religion-NDS": 58.9,
-        "Bias-Avg-NDS": 62.3,
-        "Bias-Age-GES": 85.9,
-        "Bias-Gender-GES": 86.0,
-        "Bias-Race-GES": 86.0,
-        "Bias-Nationality-GES": 86.4,
-        "Bias-Religion-GES": 83.8,
-        "Bias-Avg-GES": 85.9
-    }
-]
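Each detailed-results file removed above shares one schema: a one-element JSON list wrapping a single object that mixes model metadata (`Model`, `Model Type`, `Input Type`, `Organization`) with per-dimension scores and their averages. For anyone keeping a copy of these files, here is a minimal sketch of flattening them into a single table; the directory path and column selection are assumptions, and this is not the Space's actual loader:

```python
import json
from pathlib import Path

import pandas as pd

# Assumed checkout location of the per-model files shown in this diff.
DETAILED_DIR = Path("evals/mjbench-results/detailed-results")

records = []
for path in sorted(DETAILED_DIR.glob("*.json")):
    # Each file is a one-element list wrapping a single result object.
    records.extend(json.loads(path.read_text()))

df = pd.DataFrame(records)
cols = ["Model", "Input Type", "Alignment-Avg", "Safety-Nsfw-Avg",
        "Quality-Blurry-Avg", "Bias-Avg"]
print(df[cols].sort_values("Alignment-Avg", ascending=False).to_string(index=False))
```

Files that lack the NDS/GES bias columns (e.g. LLaVA-NeXT-vicuna-13b, whose deletion ends just above) simply contribute NaN for those fields in the combined frame, so selecting only the shared columns keeps the table clean.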
evals/mjbench-results/overall-results/AestheticsPredictor.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "AestheticsPredictor",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "LAION",
-        "Alignment": 32.4,
-        "Safety": 27.0,
-        "Quality": 69.6,
-        "Bias": 61.4
-    }
-]
evals/mjbench-results/overall-results/BLIP-v2.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "BLIP-v2",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "Salesforce",
-        "Alignment": 17.3,
-        "Safety": 44.0,
-        "Quality": 7.5,
-        "Bias": 68.7
-    }
-]
evals/mjbench-results/overall-results/CLIP-v2.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "CLIP-v2",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "LAION",
-        "Alignment": 38.1,
-        "Safety": 12.7,
-        "Quality": 34.4,
-        "Bias": 57.4
-    }
-]
evals/mjbench-results/overall-results/Claude 3 Opus.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Claude 3 Opus",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Anthropic",
-        "Alignment": 57.1,
-        "Safety": 13.4,
-        "Quality": 11.9,
-        "Bias": 57.7
-    }
-]
evals/mjbench-results/overall-results/GPT-4-vision.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "GPT-4-vision",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenAI",
-        "Alignment": 66.1,
-        "Safety": 26.5,
-        "Quality": 90.4,
-        "Bias": 79.0
-    }
-]
evals/mjbench-results/overall-results/GPT-4o.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "GPT-4o",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenAI",
-        "Alignment": 61.5,
-        "Safety": 35.3,
-        "Quality": 97.6,
-        "Bias": 65.8
-    }
-]
evals/mjbench-results/overall-results/Gemini Ultra.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Gemini Ultra",
-        "Model Type": "Closesource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Google",
-        "Alignment": 67.2,
-        "Safety": 13.1,
-        "Quality": 55.7,
-        "Bias": 55.6
-    }
-]
evals/mjbench-results/overall-results/HPS-v2.1.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "HPS-v2.1",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "CUHK MMLab",
-        "Alignment": 47.3,
-        "Safety": 18.8,
-        "Quality": 67.3,
-        "Bias": 55.0
-    }
-]
evals/mjbench-results/overall-results/Idefics2-8b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Idefics2-8b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "HuggingFace",
-        "Alignment": 32.6,
-        "Safety": 13.6,
-        "Quality": 46.1,
-        "Bias": 42.1
-    }
-]
evals/mjbench-results/overall-results/ImageReward.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "ImageReward",
-        "Model Type": "Score Model",
-        "Input Type": "Single Image",
-        "Organization": "THUDM",
-        "Alignment": 50.9,
-        "Safety": 24.9,
-        "Quality": 63.5,
-        "Bias": 40.9
-    }
-]
evals/mjbench-results/overall-results/Instructblip-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Instructblip-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Salesforce",
-        "Alignment": 17.1,
-        "Safety": 26.4,
-        "Quality": 25.2,
-        "Bias": 53.1
-    }
-]
evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "InternVL-Chat-V1-5",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "OpenGVLab",
-        "Alignment": 55.3,
-        "Safety": 6.3,
-        "Quality": 66.3,
-        "Bias": 25.4
-    }
-]
evals/mjbench-results/overall-results/LLaVA-1.5-13b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-1.5-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & Microsoft",
-        "Alignment": 10.3,
-        "Safety": 30.7,
-        "Quality": 23.3,
-        "Bias": 69.7
-    }
-]
evals/mjbench-results/overall-results/LLaVA-1.5-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-1.5-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & Microsoft",
-        "Alignment": 22.0,
-        "Safety": 24.8,
-        "Quality": 12.4,
-        "Bias": 83.7
-    }
-]
evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-NeXT-mistral-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & ByteDance",
-        "Alignment": 31.3,
-        "Safety": 15.2,
-        "Quality": 45.8,
-        "Bias": 69.9
-    }
-]
evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "LLaVA-NeXT-vicuna-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "UW-Madison & ByteDance",
-        "Alignment": 29.1,
-        "Safety": 27.9,
-        "Quality": 36.8,
-        "Bias": 56.3
-    }
-]
evals/mjbench-results/overall-results/MiniGPT4-v2.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "MiniGPT4-v2",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Vision-CAIR",
-        "Alignment": 32.8,
-        "Safety": 25.7,
-        "Quality": 36.7,
-        "Bias": 32.6
-    }
-]
evals/mjbench-results/overall-results/PickScore-v1.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "PickScore-v1",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "Stability AI",
-        "Alignment": 58.8,
-        "Safety": 37.2,
-        "Quality": 83.8,
-        "Bias": 31.0
-    }
-]
evals/mjbench-results/overall-results/Prometheus-Vision-13b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-13b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment": 11.8,
-        "Safety": 3.6,
-        "Quality": 8.7,
-        "Bias": 66.3
-    }
-]
evals/mjbench-results/overall-results/Prometheus-Vision-7b.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Prometheus-Vision-7b",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Single Image",
-        "Organization": "prometheus-eval",
-        "Alignment": 18.8,
-        "Safety": 7.1,
-        "Quality": 23.4,
-        "Bias": 49.5
-    }
-]
evals/mjbench-results/overall-results/Qwen-VL-Chat.json DELETED
@@ -1,12 +0,0 @@
-[
-    {
-        "Model": "Qwen-VL-Chat",
-        "Model Type": "Opensource VLM",
-        "Input Type": "Multi Image",
-        "Organization": "Alibaba",
-        "Alignment": 52.1,
-        "Safety": 26.8,
-        "Quality": 23.6,
-        "Bias": 71.9
-    }
-]
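The overall-results files follow an even smaller schema: the same four metadata fields plus one score per axis (`Alignment`, `Safety`, `Quality`, `Bias`). A hypothetical sketch of turning them into a leaderboard table follows; the `Avg` column is an illustrative unweighted mean, not a metric defined by the benchmark, and the path is again an assumption:

```python
import json
from pathlib import Path

import pandas as pd

# Assumed checkout location of the overall-results files shown above.
OVERALL_DIR = Path("evals/mjbench-results/overall-results")

rows = [json.loads(p.read_text())[0] for p in sorted(OVERALL_DIR.glob("*.json"))]
board = pd.DataFrame(rows)

# Unweighted mean across the four axes, for illustration only.
board["Avg"] = board[["Alignment", "Safety", "Quality", "Bias"]].mean(axis=1)
print(board.sort_values("Avg", ascending=False).to_string(index=False))
```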
src/about.py CHANGED
@@ -21,15 +21,14 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">MMIE</h1>"""
 
-MJB_LOGO = '<img src="" alt="Logo" style="width: 30%; display: block; margin: auto;">'
+# MJB_LOGO = '<img src="" alt="Logo" style="width: 30%; display: block; margin: auto;">'
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-#
-
-[Website](https://mj-bench.github.io) | [Code](https://github.com/MJ-Bench/MJ-Bench) | [Eval. Dataset](https://huggingface.co/datasets/MJ-Bench/MJ-Bench) | [Results](https://huggingface.co/datasets/MJ-Bench/MJ-Bench-Results) | [Refined Model via RMs](https://huggingface.co/collections/MJ-Bench/aligned-diffusion-model-via-dpo-667f8b71f35c3ff47acafd43) | [Paper](https://arxiv.org/abs/2407.04842) | Total models: {}
+# MMIE: Massive Multimodal Interleaved Comprehension Benchmark for Large Vision-Language Models
+[Website](https://github.com/richard-peng-xia/MMIE) | [Code](https://github.com/richard-peng-xia/MMIE) | [Dataset](https://huggingface.co/datasets/MMIE/MMIE) | [Results](https://huggingface.co/datasets/MMIE/MMIE-Leaderboard) | [Eval Model](https://huggingface.co/MMIE/MMIE-Eval) | [Paper]()
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
src/envs.py CHANGED
@@ -9,9 +9,9 @@ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"
-QUEUE_REPO = f"
-RESULTS_REPO = f"
+REPO_ID = f"MMIE/MMIE-Leaderboard"
+QUEUE_REPO = f"MMIE/MMIE-Requests"
+RESULTS_REPO = f"MMIE/MMIE-Results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
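In the demo-leaderboard template this Space is built on, constants like `QUEUE_REPO` and `RESULTS_REPO` are typically passed to `huggingface_hub.snapshot_download` to sync the request and result datasets locally before the app builds its tables. A sketch of that pattern with the new repo IDs; the exact call site in `app.py` and the `local_dir` choices are assumptions:

```python
from huggingface_hub import snapshot_download

# Repo IDs taken from the src/envs.py diff above. snapshot_download
# reuses the local cache, so re-running only fetches upstream changes.
EVAL_REQUESTS_PATH = snapshot_download(
    repo_id="MMIE/MMIE-Requests", repo_type="dataset", local_dir="./eval-queue"
)
EVAL_RESULTS_PATH = snapshot_download(
    repo_id="MMIE/MMIE-Results", repo_type="dataset", local_dir="./eval-results"
)
print(f"queue: {EVAL_REQUESTS_PATH}\nresults: {EVAL_RESULTS_PATH}")
```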
src/logo.png ADDED