\begin{table}[htb]
\vspace{-5pt}
\centering
\small
\caption{Comparison of open-source judges with respect to different input modes. Specifically, we study VLMs with single image input (single), pairwise image input (pair-f), and pairwise image input in reverse order (pair-r). The best performance in each column is in bold.}
\resizebox{0.92\linewidth}{!}{%
\begin{tabular}{l|ccc|ccc|ccc}
\toprule
& \multicolumn{3}{c|}{\bf Alignment} & \multicolumn{3}{c|}{\bf Safety} & \multicolumn{3}{c}{\bf Artifact} \\
& single & pair-f & pair-r & single & pair-f & pair-r & single & pair-f & pair-r \\
\midrule
Qwen-VL-Chat$^\spadesuit$ & $29.1$ & $31.1$ & $\textbf{73.0}$ & $\textbf{33.5}$ & $6.8$ & $\textbf{60.1}$ & $19.8$ & $5.7$ & $41.5$ \\
InternVL-Chat-V1-5$^\spadesuit$ & $\textbf{32.8}$ & $\textbf{75.8}$ & $34.8$ & $20.1$ & $5.9$ & $4.6$ & $38.8$ & $\textbf{91.8}$ & $40.7$ \\
Idefics2-8b$^\spadesuit$ & $30.2$ & $32.6$ & $32.6$ & $27.3$ & $\textbf{13.7}$ & $32.6$ & $\textbf{40.2}$ & $49.0$ & $\textbf{43.2}$ \\
\bottomrule
\end{tabular}%
}
\label{exp:judge_consitiency}
\end{table}
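To make the pair-f versus pair-r comparison concrete, the sketch below shows one way such an order-consistency check could be implemented. It is a minimal illustration under assumed interfaces, not the evaluation harness behind this table: the \texttt{judge\_pair} helper, the \texttt{order\_consistency} function, and the sample format are hypothetical placeholders standing in for the actual judge API.

\begin{verbatim}
# Minimal sketch (assumed interfaces, not the paper's actual harness) of the
# pair-f / pair-r protocol: query a VLM judge on an image pair in both
# orders and measure how often its verdict survives the swap.

def judge_pair(judge, prompt, img_a, img_b):
    """Hypothetical helper: returns 'A' or 'B' for the preferred image."""
    raise NotImplementedError  # depends on the specific VLM judge API

def order_consistency(judge, samples):
    # samples: iterable of (prompt, img_a, img_b, human_label) tuples,
    # where human_label is 'A' or 'B' in the original image order.
    acc_f = acc_r = consistent = 0
    for prompt, img_a, img_b, label in samples:
        v_f = judge_pair(judge, prompt, img_a, img_b)  # pair-f: original order
        v_r = judge_pair(judge, prompt, img_b, img_a)  # pair-r: swapped order
        v_r = 'A' if v_r == 'B' else 'B'               # map verdict back
        acc_f += (v_f == label)
        acc_r += (v_r == label)
        consistent += (v_f == v_r)  # same verdict regardless of image order?
    n = len(samples)
    return acc_f / n, acc_r / n, consistent / n
\end{verbatim}

Under this reading, a large gap between a judge's pair-f and pair-r accuracies (e.g., Qwen-VL-Chat's $31.1$ vs.\ $73.0$ on Alignment) indicates a strong position bias: the verdict depends on the order in which the two images are presented.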