Spaces:
Running
Running
% Scale-study table: judge accuracy under different numerical score ranges
% and Likert scales, alignment perspective only.
\begin{table}[t]
\centering
\small
\caption{Performance comparison of multimodal judges w.r.t. different ranges of numerical scale and likert range. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], [0, 100]. The best performance across all models is bolded.}
% Label placed directly after the caption (it was previously inside the
% \resizebox argument, after \end{tabular}).
\label{exp:scale_study}
\resizebox{0.7\linewidth}{!}{%
\begin{tabular}{l|cccc|cc}
\toprule
 & \multicolumn{4}{c|}{\textbf{Numerical}} & \multicolumn{2}{c}{\textbf{Likert}} \\
 & [0, 1] & [0, 5] & [0, 10] & [0, 100] & 5-likert & 10-likert \\
\midrule
LLaVA-1.5-7b$^\heartsuit$ & $15.0$ & $26.7$ & $22.0$ & $18.3$ & $ 5.3$ & $10.3$ \\
LLaVA-1.5-13b$^\heartsuit$ & $ 9.7$ & $12.0$ & $10.3$ & $20.5$ & $ 2.6$ & $ 6.8$ \\
LLaVA-NeXT-mistral-7b$^\heartsuit$ & $20.8$ & $27.1$ & $31.3$ & $29.3$ & $36.0$ & $38.6$ \\
LLaVA-NeXT-vicuna-13b$^\heartsuit$ & $18.3$ & $26.7$ & $29.1$ & $17.2$ & $28.7$ & $17.2$ \\
Instructblip-7b$^\heartsuit$ & $15.0$ & $20.9$ & $17.1$ & $17.6$ & $11.9$ & $16.8$ \\
MiniGPT4-v2$^\heartsuit$ & $20.4$ & $28.9$ & $32.8$ & $20.9$ & $16.0$ & $28.7$ \\
Prometheus-Vision-7b$^\heartsuit$ & $3.8 $ & $16.7$ & $18.4$ & $15.7$ & $28.7$ & $31.3$ \\
Prometheus-Vision-13b$^\heartsuit$ & $19.7$ & $11.5$ & $11.8$ & $11.2$ & $11.0$ & $6.9$ \\
\midrule
Qwen-VL-Chat$^\spadesuit$ & $26.7$ & $34.6$ & $31.1$ & $26.9$ & $55.5$ & $30.6$ \\
Internvl-chat-v1-5$^\spadesuit$ & $33.0$ & $27.6$ & $75.8$ & $35.3$ & $73.3$ & $18.9$ \\
Idefics2-8b$^\spadesuit$ & $14.6$ & $16.6$ & $32.6$ & $32.6$ & $41.2$ & $25.6$ \\
\midrule
% Bold = best per column; markup unified to $\textbf{..}$ (was a mix of
% plain-TeX \bf and \textbf).
GPT-4-vision$^\clubsuit$ & $63.2$ & $61.2$ & $66.1$ & $\textbf{67.2}$ & $\textbf{60.2}$ & $\textbf{63.0}$ \\
GPT-4o$^\clubsuit$ & $\textbf{63.9}$ & $61.3$ & $61.5$ & $62.8$ & $56.3$ & $60.3$ \\
Gemini Ultra$^\clubsuit$ & $59.3$ & $\textbf{67.3}$ & $\textbf{67.2}$ & $60.1$ & $51.4$ & $57.8$ \\
Claude 3 Opus$^\clubsuit$ & $60.7$ & $45.5$ & $57.1$ & $49.4$ & $56.1$ & $62.4$ \\
\midrule
% NOTE(review): 32.33 has one more decimal than every other entry — presumably
% should be 32.3; confirm against the raw results before normalizing.
\cellcolor{skyblue} Overall & \cellcolor{skyblue}30.3 & \cellcolor{skyblue}32.3 & \cellcolor{skyblue} 37.6 & \cellcolor{skyblue}32.33 & \cellcolor{skyblue}35.6 & \cellcolor{skyblue}31.7 \\
\bottomrule
\end{tabular}%
}
\vspace{-1em}
\end{table}
% \begin{table}[t]
% \centering
% \caption{Performance comparison of these multimodal judges w.r.t. different ranges of numerical scale. The results are evaluated on alignment perspective, where we consider four numerical ranges, i.e. [0, 1], [0, 5], [0, 10], and [0, 100]. The best performance across all models is bolded.}
% \resizebox{0.7\linewidth}{!}{%
% \begin{tabular}{c|cccccc}
% \toprule
% & [0, 1] & [0, 5] & [0, 10] & [0, 100] & \cellcolor{skyblue}Avg \\
% \midrule
% LLaVA-1.5-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
% LLaVA-1.5-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
% LLaVA-NeXT-mistral-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
% LLaVA-NeXT-vicuna-13b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} \\
% Instructblip-7b$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% MiniGPT4-v2$^\heartsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Qwen-VL-Chat$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Internvl-chat-v1-5$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Idefics2-8b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Prometheus-Vision-13b$^\spadesuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% \midrule
% GPT-4-vision$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% GPT-4o$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Gemini Ultra$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% Claude 3 Opus$^\clubsuit$ & - & - & - & - & \cellcolor{skyblue} - \\
% \bottomrule
% \end{tabular}}
% \label{exp:scale_study}
% \end{table}