update mj-bench
This view is limited to 50 files because it contains too many changes. See raw diff.
- app.py +11 -2
- evals/.gitattributes +55 -0
- evals/README.md +6 -0
- evals/{mjbench → mjbench-results}/detailed-results/AestheticsPredictor.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/BLIP-v2.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/CLIP-v2.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/Claude 3 Opus.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/GPT-4-vision.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/GPT-4o.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/Gemini Ultra.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/HPS-v2.1.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/Idefics2-8b.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/ImageReward.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/Instructblip-7b.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/InternVL-Chat-V1-5.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-13b.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-7b.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-mistral-7b.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-vicuna-13b.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/MiniGPT4-v2.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/PickScore-v1.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-13b.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-7b.json +0 -0
- evals/{mjbench → mjbench-results}/detailed-results/Qwen-VL-Chat.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/AestheticsPredictor.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/BLIP-v2.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/CLIP-v2.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/Claude 3 Opus.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/GPT-4-vision.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/GPT-4o.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/Gemini Ultra.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/HPS-v2.1.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/Idefics2-8b.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/ImageReward.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/Instructblip-7b.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/InternVL-Chat-V1-5.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-13b.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-7b.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-mistral-7b.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-vicuna-13b.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/MiniGPT4-v2.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/PickScore-v1.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-13b.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-7b.json +0 -0
- evals/{mjbench → mjbench-results}/overall-results/Qwen-VL-Chat.json +0 -0
- evals/mjbench/latex_reults/alignment_narrative.tex +0 -37
- evals/mjbench/latex_reults/alignment_number_10.tex +0 -29
- evals/mjbench/latex_reults/alignment_number_5.tex +0 -35
- evals/mjbench/latex_reults/artifact_narrative.tex +0 -29
- evals/mjbench/latex_reults/artifact_number_10.tex +0 -38
app.py
CHANGED
@@ -35,6 +35,14 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
 SUBSET_COUNTS = {
     "Alignment-Object": 250,
     "Alignment-Attribute": 229,
@@ -71,6 +79,7 @@ PERSPECTIVE_COUNTS= {
 META_DATA = ['Model', 'Model Type', 'Input Type', 'Organization']
 
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
@@ -192,12 +201,12 @@ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=MET
     return new_df
 
 
-results_path = Path("
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(SUBSET_COUNTS.keys())
 detailed_df = avg_all_subset(orig_df, colmuns_name).round(2)
 
-results_path = Path("
+results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/overall-results")
 orig_df = get_leaderboard_results(results_path)
 colmuns_name = list(PERSPECTIVE_COUNTS.keys())
 perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2)
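In short, the change above makes app.py download the evaluation-results dataset into EVAL_RESULTS_PATH via huggingface_hub.snapshot_download at startup, restart the Space if the download fails, and then build the leaderboard tables from the renamed mjbench-results directories. The following is a minimal, self-contained sketch of that pattern, not the Space's actual configuration: the repo ids, token handling, and constant values are placeholder assumptions standing in for what src/envs.py defines.

from pathlib import Path

from huggingface_hub import HfApi, snapshot_download

# Placeholder values standing in for the constants defined in src/envs.py.
RESULTS_REPO = "org-name/results-dataset"   # hypothetical dataset repo id
EVAL_RESULTS_PATH = "eval-results"          # local directory for the snapshot
REPO_ID = "org-name/leaderboard-space"      # hypothetical Space id
TOKEN = None                                # or an HF token with the needed access

API = HfApi(token=TOKEN)

def restart_space():
    # Fallback used when the results snapshot cannot be fetched.
    API.restart_space(repo_id=REPO_ID)

try:
    # Mirror of the try/except introduced in app.py: pull the evaluation
    # results dataset locally before the leaderboard tables are computed.
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()

# The per-model JSON results are then read from the renamed directories.
detailed_dir = Path(EVAL_RESULTS_PATH) / "mjbench-results" / "detailed-results"
overall_dir = Path(EVAL_RESULTS_PATH) / "mjbench-results" / "overall-results"
print(sorted(p.name for p in detailed_dir.glob("*.json")))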
evals/.gitattributes
ADDED
@@ -0,0 +1,55 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
evals/README.md
ADDED
@@ -0,0 +1,6 @@
+---
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/datasets-cards
+{}
+---
+# Coming Soon
evals/{mjbench → mjbench-results}/detailed-results/AestheticsPredictor.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/BLIP-v2.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/CLIP-v2.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Claude 3 Opus.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/GPT-4-vision.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/GPT-4o.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Gemini Ultra.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/HPS-v2.1.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Idefics2-8b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/ImageReward.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Instructblip-7b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/InternVL-Chat-V1-5.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-13b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-1.5-7b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-mistral-7b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/LLaVA-NeXT-vicuna-13b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/MiniGPT4-v2.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/PickScore-v1.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-13b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Prometheus-Vision-7b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/detailed-results/Qwen-VL-Chat.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/AestheticsPredictor.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/BLIP-v2.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/CLIP-v2.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Claude 3 Opus.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/GPT-4-vision.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/GPT-4o.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Gemini Ultra.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/HPS-v2.1.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Idefics2-8b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/ImageReward.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Instructblip-7b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/InternVL-Chat-V1-5.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-13b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-1.5-7b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-mistral-7b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/LLaVA-NeXT-vicuna-13b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/MiniGPT4-v2.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/PickScore-v1.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-13b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Prometheus-Vision-7b.json
RENAMED
File without changes
evals/{mjbench → mjbench-results}/overall-results/Qwen-VL-Chat.json
RENAMED
File without changes
evals/mjbench/latex_reults/alignment_narrative.tex
DELETED
@@ -1,37 +0,0 @@
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
-\resizebox{0.9\linewidth}{!}{%
-\begin{tabular}{c|cccccc}
-\toprule
- & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
-\midrule
-% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% \midrule
-LLaVA-1.5-7b$^\heartsuit$ & $19.1$ & $17.8$ & $20.5$ & $16.9$ & $25.0$ & \cellcolor{skyblue} $19.2$ \\
-LLaVA-1.5-13b$^\heartsuit$ & $22.7$ & $21.3$ & $22.2$ & $15.6
-$ & $17.9$ & \cellcolor{skyblue} $21.1$ \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & $19.1$ & $17.8$ & $16.2$ & $10.4$ & $12.5$ & \cellcolor{skyblue} $16.8$ \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $22.7$ & $21.3$ & $17.1$ & $20.8$ & $16.1$ & \cellcolor{skyblue} $20.7$ \\
-Instructblip-7b$^\heartsuit$ & $22.3$ & $20.9$ & $17.1
-$ & $15.6$ & $7.10$ & \cellcolor{skyblue} $19.2$ \\
-MiniGPT4-v2$^\heartsuit$ & $21.1$ & $27.0$ & $22.2$ & $23.4$ & $23.2$ & \cellcolor{skyblue} $23.5$ \\
-Prometheus-Vision-7b$^\heartsuit$ & $21.9$ & $17.4$ & $21.4$ & $18.2$ & $5.40$ & \cellcolor{skyblue} $18.7$ \\
-Prometheus-Vision-13b$^\heartsuit$ & $15.1$ & $13.9$ & $12.8$ & $11.5$ & $5.40$ & \cellcolor{skyblue} $13.3$ \\
-Qwen-VL-Chat$^\spadesuit$ & $22.7$ & $22.6$ & $22.2$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $22.7$ \\
-Internvl-chat-v1-5$^\spadesuit$ & $19.9$ & $17.8$ & $20.5$ & $20.8$ & $26.8$ & \cellcolor{skyblue} $20.0$ \\
-Idefics2-8b$^\spadesuit$ & $27.9$ & $24.8$ & $26.5$ & $27.3$ & $28.6$ & \cellcolor{skyblue} $26.7$ \\
-\midrule
-GPT-4-vision$^\clubsuit$ & $46.3$ & $\bf 49.7$ & $39.7$ & $48.6$ & $\bf 50.7$ & \cellcolor{skyblue} $43.$1 \\
-GPT-4o$^\clubsuit$ & $\bf 46.6$ & $45.5$ & $\bf 41.9$ & $\bf 53.0$ & $50.0$ & \cellcolor{skyblue} $\bf 47.2$ \\
-Gemini Ultra$^\clubsuit$ & $27.9$ & $29.4$ & $20.2$ & $35.7$ & $29.5$ & \cellcolor{skyblue} $31.9$ \\
-Claude 3 Opus$^\clubsuit$ & $28.8$ & $26.3$ & $22.6$ & $35.7$ & $33.0$ & \cellcolor{skyblue} $29.8$ \\
-\bottomrule
-\end{tabular}}
-\label{exp:alignment_narrative_5}
-\end{table}
evals/mjbench/latex_reults/alignment_number_10.tex
DELETED
@@ -1,29 +0,0 @@
-
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
-\resizebox{0.9\linewidth}{!}{%
-\begin{tabular}{c|cccccc}
-\toprule
- & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
-\midrule
-LLaVA-1.5-7b$^\heartsuit$ & $20.7$ & $25.2$ & $23.1$ & $18.2$ & $17.9$ & \cellcolor{skyblue} $22.0$ \\
-LLaVA-1.5-13b$^\heartsuit$ & $17.7$ & $13.5$ & $11.8$ & $16.5$ & $8.9$ & \cellcolor{skyblue} $10.3$ \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & $25.9$ & $30.0$ & $41.9$ & $33.8$ & $35.7$ & \cellcolor{skyblue} $31.3$ \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $25.9$ & $27.4$ & $31.6$ & $38.9$ & $32.1$ & \cellcolor{skyblue} $29.1$ \\
-Instructblip-7b$^\heartsuit$ & $17.1$ & $17.4$ & $16.2$ & $13.1$ & $21.4$ & \cellcolor{skyblue} $17.1$ \\
-MiniGPT4-v2$^\heartsuit$ & $37.5$ & $30.9$ & $30.8$ & $32.5$ & $39.3$ & \cellcolor{skyblue} $32.8$ \\
-Prometheus-Vision-7b$^\heartsuit$ & $19.5$ & $15.2$ & $16.2$ & $22.1$ & $26.8$ & \cellcolor{skyblue} $18.8$ \\
-Prometheus-Vision-13b$^\heartsuit$ & $14.3$ & $10.9$ & $9.4$ & $11.7$ & $16.1$ & \cellcolor{skyblue} $11.8$ \\
-Qwen-VL-Chat$^\spadesuit$ & $30.7$ & $29.1$ & $35.9$ & $29.9$ & $32.1$ & \cellcolor{skyblue} $31.1$ \\
-Internvl-chat-v1-5$^\spadesuit$ & $\bf 73.3$ & $\bf 74.8$ & $\bf 78.6$ & $\bf 80.5$ & $\bf 78.6$ & \cellcolor{skyblue} $\bf 75.8$ \\
-Idefics2-8b$^\spadesuit$ & $35.5$ & $31.7$ & $30.8$ & $29.9$ & $30.4$ & \cellcolor{skyblue} $32.6$ \\
-\midrule
-GPT-4-vision$^\clubsuit$ & $68.1$ & $62.9$ & $64.1$ & $67.1$ & $73.2$ & \cellcolor{skyblue} $66.1$ \\
-GPT-4o$^\clubsuit$ & $62.2$ & $57.2$ & $64.1$ & $63.2$ & $67.9$ & \cellcolor{skyblue} $61.5$ \\
-Gemini Ultra$^\clubsuit$ & $71.7$ & $65.1$ & $63.2$ & $64.5$ & $67.8$ & \cellcolor{skyblue} $67.2$ \\
-Claude 3 Opus$^\clubsuit$ & $64.9$ & $38.9$ & $44.4$ & $55.3$ & $55.4$ & \cellcolor{skyblue} $57.1$ \\
-\bottomrule
-\end{tabular}}
-\label{exp:alignment_number_10}
-\end{table}
evals/mjbench/latex_reults/alignment_number_5.tex
DELETED
@@ -1,35 +0,0 @@
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback is provided in the numerical scale of range [0, 5]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.}
-\resizebox{0.9\linewidth}{!}{%
-\begin{tabular}{c|cccccc}
-\toprule
- & Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\
-\midrule
-% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\
-% \midrule
-LLaVA-1.5-7b$^\heartsuit$ & 27.1 & 25.7 & 28.2 & 26.0 & 26.8 & \cellcolor{skyblue} 26.8 \\
-LLaVA-1.5-13b$^\heartsuit$ & 11.2 & 14.5 & 12.8 & 7.80 & 14.3 & \cellcolor{skyblue} 12.1 \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & 27.9 & 28.3 & 29.1 & 24.7 & 25.0 & \cellcolor{skyblue} 27.0 \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 28.7 & 21.3 & 31.6 & 28.6 & 26.8 & \cellcolor{skyblue} 27.4 \\
-Instructblip-7b$^\heartsuit$ & 19.9 & 20.9 & 25.6 & 18.2 & 19.6 & \cellcolor{skyblue} 20.8 \\
-MiniGPT4-v2$^\heartsuit$ & 27.5 & 26.1 & 32.5 & 37.7 & 26.8 & \cellcolor{skyblue} 30.1 \\
-Prometheus-Vision-7b$^\heartsuit$ & 18.7 & 13.5 & 14.5 & 19.5 & 25.0 & \cellcolor{skyblue} 18.2 \\
-Prometheus-Vision-13b$^\heartsuit$ & 12.4 & 11.3 & 9.4 & 11.7 & 12.5 & \cellcolor{skyblue} 11.5 \\
-Qwen-VL-Chat$^\spadesuit$ & 30.3 & 34.8 & 39.3 & 40.3 & 35.7 & \cellcolor{skyblue} 36.1 \\
-Internvl-chat-v1-5$^\spadesuit$ & 24.7 & 28.7 & 25.6 & 29.9 & 37.5 & \cellcolor{skyblue} 29.3 \\
-Idefics2-8b$^\spadesuit$ & 17.1 & 17.0 & 13.5 & 14.3 & 19.6 & \cellcolor{skyblue} 16.3 \\
-\midrule
-GPT-4-vision$^\clubsuit$ & \bf 45.3 & \bf 46.3 & 41.3 & 48.3 & 48.3 & \cellcolor{skyblue} 45.9 \\
-GPT-4o$^\clubsuit$ & 44.2 & 45.3 & \bf 43.3 & \bf 53.4 & \bf 51.3 & \cellcolor{skyblue} \bf 48.6 \\
-Gemini Ultra$^\clubsuit$ & 31.7 & 29.7 & 23.7 & 39.7 & 32.7 & \cellcolor{skyblue} 29.9 \\
-Claude 3 Opus$^\clubsuit$ & 24.9 & 28.9 & 25.9 & 31.2 & 29.2 & \cellcolor{skyblue} 26.3 \\
-\bottomrule
-\end{tabular}}
-\label{exp:alignment_number_5}
-\end{table}
evals/mjbench/latex_reults/artifact_narrative.tex
DELETED
@@ -1,29 +0,0 @@
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback is provided in the following Likert scale: [\textit{Extremely Poor}, \textit{Poor}, \textit{Average}, \textit{Good}, \textit{Outstanding}]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
-\resizebox{1.0\linewidth}{!}{%
-\begin{tabular}{c|cccc|ccc}
-\toprule
- & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
- & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
-\midrule
-LLaVA-1.5-7b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 1.80 & 10.6 & \cellcolor{skyblue} 6.50 \\
-LLaVA-1.5-13b$^\heartsuit$ & 0.00 & 0.00 & 0.00 & \cellcolor{skyblue} 0.00 & 18.7 & 29.7 & \cellcolor{skyblue} 24.9 \\
-LLaVA-NeXT-mistral-7b$^\heartsuit$ & 10.8 & 14.2 & 1.30 & \cellcolor{skyblue} 9.10 & 56.7 & 73.0 & \cellcolor{skyblue} 61.3 \\
-LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 19.6 & 14.3 & 13.9 & \cellcolor{skyblue} 16.8 & 25.8 & 27.3 & \cellcolor{skyblue} 26.6 \\
-Instructblip-7b$^\heartsuit$ & 9.80 & 3.00 & 18.7 & \cellcolor{skyblue} 10.9 & 9.80 & 9.90 & \cellcolor{skyblue} 9.50 \\
-Prometheus-Vision-7b$^\heartsuit$ & 19.8 & 15.6 & 12.2 & \cellcolor{skyblue} 16.0 & 26.0 & 29.2 & \cellcolor{skyblue} 27.2 \\
-Prometheus-Vision-13b$^\heartsuit$ & 7.40 & 5.10 & 7.30 & \cellcolor{skyblue} 6.80 & 9.40 & 11.7 & \cellcolor{skyblue} 11.1 \\
-Qwen-VL-Chat$^\spadesuit$ & 25.2 & 21.6 & 6.70 & \cellcolor{skyblue} 17.4 & 18.8 & 20.1 & \cellcolor{skyblue} 19.3 \\
-Internvl-chat-v1-5$^\spadesuit$ & 22.1 & 24.2 & 1.20 &\cellcolor{skyblue} 16.0 & \bf 94.2 & 96.1 & \cellcolor{skyblue} \bf 95.3 \\
-Idefics2-8b$^\spadesuit$ & 40.9 & 29.6 & 10.1 & \cellcolor{skyblue} 27.0 & 90.2 & 67.5 & \cellcolor{skyblue} 79.2 \\
-\midrule
-GPT-4-vision$^\clubsuit$ & 86.9 & 54.4 & 78.7 & \cellcolor{skyblue} 71.5 & 90.6 & \bf 93.5 & \cellcolor{skyblue} 93.6 \\
-GPT-4o$^\clubsuit$ & \bf 98.2 & \bf 71.1 & \bf 89.9 & \cellcolor{skyblue} \bf 83.6 & 91.8 & 96.1 & \cellcolor{skyblue} 91.6 \\
-Gemini Ultra$^\clubsuit$ & 71.3 & 30.5 & 59.2 & \cellcolor{skyblue} 48.8 & 80.6 & 90.9 & \cellcolor{skyblue} 79.5 \\
-Claude 3 Opus$^\clubsuit$ & 21.3 & 17.2 & 9.50 & \cellcolor{skyblue} 14.0 & 85.9 & 93.1 & \cellcolor{skyblue} 83.7 \\
-\bottomrule
-\end{tabular}%
-}
-\label{exp:artifact_result_narrative_5}
-\end{table}
evals/mjbench/latex_reults/artifact_number_10.tex
DELETED
@@ -1,38 +0,0 @@
-
-\begin{table}[h]
-\centering
-\caption{The detailed evaluation result of all multimodal judges on \textbf{quality} perspective. The feedback are provided in numerical scale of range [0, 10]. Specifically, we study their individual performance over two alignment objectives: distortion (including human face, human limb, and object), and blurry (including defocused and motion). The best performance across all models is bolded.}
-\resizebox{1.0\linewidth}{!}{%
-\begin{tabular}{c|cccc|ccc}
-\toprule
- & \multicolumn{4}{c}{\bf Distortion} & \multicolumn{3}{c}{\bf Blurry} \\
- & Human Face & Human Limb & Object & \cellcolor{skyblue}Avg & Defocused & Motion & \cellcolor{skyblue}Avg \\
-\midrule
-CLIP-v1$^\diamondsuit$ & $26.6$ & $17.2$ & $34.0$ & \cellcolor{skyblue} $19.3$ & $50.6$ & $63.7$ & \cellcolor{skyblue} $56.7$ \\
-BLIP-v2$^\diamondsuit$ & $3.60$ & $2.00$ & $1.10$ & \cellcolor{skyblue} $1.90$ & $8.30$ & $47.2$ & \cellcolor{skyblue} $15.0$ \\
-PickScore-v1$^\diamondsuit$ & $83.4$ & $68.2$ & $92.1$ & \cellcolor{skyblue} $79.3$ & $80.6$ & $93.4$ & \cellcolor{skyblue} $86.6$ \\
-HPS-v2.1$^\diamondsuit$ & $60.4$ & $37.1$ & $80.3$ & \cellcolor{skyblue} $51.7$ & $85.7$ & $94.6$ & \cellcolor{skyblue} $88.6$ \\
-ImageReward$^\diamondsuit$ & $31.4$ & $34.4$ & $40.2$ & \cellcolor{skyblue} $33.3$ & $77.4$ & $86.6$ & \cellcolor{skyblue} $82.1$ \\
-Aesthetics$^\diamondsuit$ & $78.7$ & $57.1$ & $51.3$ & \cellcolor{skyblue} $52.1$ & $90.1$ & $93.4$ & \cellcolor{skyblue} $91.6$ \\
-\midrule
-LLaVA-1.5-7b$^\heartsuit$ & $13.6$ & $7.30$ & $9.20$ & \cellcolor{skyblue} $10.2$ & $7.10$ & $19.1$ & \cellcolor{skyblue} $13.1$ \\
-LLaVA-1.5-13b$^\heartsuit$ & $20.1$ & $14.6$ & $13.3$ & \cellcolor{skyblue} $16.4$ & $18.0$ & $34.0$ & \cellcolor{skyblue} $26.1$ \\
-LLaVA-NeXT-7b$^\heartsuit$ & $28.4$ & $27.8$ & $19.0$ & \cellcolor{skyblue} $30.1$ & $41.7$ & $66.1$ & \cellcolor{skyblue} $53.9$ \\
-LLaVA-NeXT-13b$^\heartsuit$ & $18.9$ & $27.8$ & $12.0$ & \cellcolor{skyblue} $20.5$ & $40.6$ & $45.4$ & \cellcolor{skyblue} $43.0$ \\
-Instructblip-7b$^\heartsuit$ & $12.4$ & $9.30$ & $21.0$ & \cellcolor{skyblue} $13.3$ & $32.3$ & $31.1$ & \cellcolor{skyblue} $31.7$ \\
-MiniGPT4-v2$^\heartsuit$ & $39.6$ & $39.1$ & $42.0$ & \cellcolor{skyblue} $40.0$ & $33.4$ & $37.4$ & \cellcolor{skyblue} $35.4$ \\
-Prometheus-Vision-7b$^\heartsuit$ & $16.6$ & $17.9$ & $14.1$ & \cellcolor{skyblue} $16.4$ & $22.3$ & $30.3$ & \cellcolor{skyblue} $26.3$ \\
-Prometheus-Vision-13b$^\heartsuit$ & $7.10$ & $4.60$ & $7.20$ & \cellcolor{skyblue} $6.20$ & $9.40$ &$10.6$ & \cellcolor{skyblue} $10.0$ \\
-Qwen-VL-Chat$^\spadesuit$ & $14.2$ & $15.9$ & $9.40$ & \cellcolor{skyblue} $13.6$ & $0.90$ & $2.10$ & \cellcolor{skyblue} $1.40$ \\
-Internvl-chat-v1-5$^\spadesuit$ & $97.0$ & $\bf 95.4$ & $97.1$ & \cellcolor{skyblue} $\bf 97.1$ & $89.7$ & $89.7$ & \cellcolor{skyblue} $89.7$ \\
-Idefics2-8b$^\spadesuit$ & $29.6$ & $25.8$ & $2.30$ & \cellcolor{skyblue} $21.7$ & $70.6$ & $46.9$ & \cellcolor{skyblue} $58.7$ \\
-\midrule
-GPT-4-vision$^\clubsuit$ & $87.6$ & $57.6$ & $83.1$ & \cellcolor{skyblue} $75.7$ & $98.8$ & $99.3$ & \cellcolor{skyblue} $99.2$ \\
-GPT-4o$^\clubsuit$ & $\bf 99.4$ & $78.2$ & $\bf 100$ & \cellcolor{skyblue} $93.8$ & $\bf 100$ & $\bf 100$ & \cellcolor{skyblue} $\bf 100$ \\
-Gemini Ultra$^\clubsuit$ & $73.4$ & $32.5$ & $61.0$ & \cellcolor{skyblue} $55.7$ & $86.5$ & $97.3$ & \cellcolor{skyblue} $93.9$ \\
-Claude 3 Opus$^\clubsuit$ & $26.6$ & $19.3$ & $10.7$ & \cellcolor{skyblue} $17.6$ & $89.6$ & $93.3$ & \cellcolor{skyblue} $92.7$ \\
-\bottomrule
-\end{tabular}%
-}
-\label{exp:artifact_result_number_10}
-\end{table}