Spaces:
Running
Running
\begin{table}[h] | |
\centering | |
\caption{The detailed evaluation result of all multimodal judges on \textbf{alignment} perspective. The feedback is provided in the numerical scale of range [0, 5]. Specifically, we study their individual performance over five alignment objectives: object (existence), attribute, action, location, and count. The best performance across all models is bolded.} | |
\resizebox{0.9\linewidth}{!}{% | |
\begin{tabular}{c|cccccc} | |
\toprule | |
& Object & Attribute & Action & Location & Count & \cellcolor{skyblue}Avg \\ | |
\midrule | |
% CLIP-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ | |
% BLIP-v2$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ | |
% PickScore-v1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ | |
% HPS-v2.1$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ | |
% ImageReward$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ | |
% Aesthetics$^\diamondsuit$ & - & - & - & - & - & \cellcolor{skyblue} - \\ | |
% \midrule | |
LLaVA-1.5-7b$^\heartsuit$ & 27.1 & 25.7 & 28.2 & 26.0 & 26.8 & \cellcolor{skyblue} 26.8 \\ | |
LLaVA-1.5-13b$^\heartsuit$ & 11.2 & 14.5 & 12.8 & 7.80 & 14.3 & \cellcolor{skyblue} 12.1 \\ | |
LLaVA-NeXT-mistral-7b$^\heartsuit$ & 27.9 & 28.3 & 29.1 & 24.7 & 25.0 & \cellcolor{skyblue} 27.0 \\ | |
LLaVA-NeXT-vicuna-13b$^\heartsuit$ & 28.7 & 21.3 & 31.6 & 28.6 & 26.8 & \cellcolor{skyblue} 27.4 \\ | |
Instructblip-7b$^\heartsuit$ & 19.9 & 20.9 & 25.6 & 18.2 & 19.6 & \cellcolor{skyblue} 20.8 \\ | |
MiniGPT4-v2$^\heartsuit$ & 27.5 & 26.1 & 32.5 & 37.7 & 26.8 & \cellcolor{skyblue} 30.1 \\ | |
Prometheus-Vision-7b$^\heartsuit$ & 18.7 & 13.5 & 14.5 & 19.5 & 25.0 & \cellcolor{skyblue} 18.2 \\ | |
Prometheus-Vision-13b$^\heartsuit$ & 12.4 & 11.3 & 9.4 & 11.7 & 12.5 & \cellcolor{skyblue} 11.5 \\ | |
Qwen-VL-Chat$^\spadesuit$ & 30.3 & 34.8 & 39.3 & 40.3 & 35.7 & \cellcolor{skyblue} 36.1 \\ | |
Internvl-chat-v1-5$^\spadesuit$ & 24.7 & 28.7 & 25.6 & 29.9 & 37.5 & \cellcolor{skyblue} 29.3 \\ | |
Idefics2-8b$^\spadesuit$ & 17.1 & 17.0 & 13.5 & 14.3 & 19.6 & \cellcolor{skyblue} 16.3 \\ | |
\midrule | |
GPT-4-vision$^\clubsuit$ & \bf 45.3 & \bf 46.3 & 41.3 & 48.3 & 48.3 & \cellcolor{skyblue} 45.9 \\ | |
GPT-4o$^\clubsuit$ & 44.2 & 45.3 & \bf 43.3 & \bf 53.4 & \bf 51.3 & \cellcolor{skyblue} \bf 48.6 \\ | |
Gemini Ultra$^\clubsuit$ & 31.7 & 29.7 & 23.7 & 39.7 & 32.7 & \cellcolor{skyblue} 29.9 \\ | |
Claude 3 Opus$^\clubsuit$ & 24.9 & 28.9 & 25.9 & 31.2 & 29.2 & \cellcolor{skyblue} 26.3 \\ | |
\bottomrule | |
\end{tabular}} | |
\label{exp:alignment_number_5} | |
\end{table} | |