\begin{table}[htb]
\vspace{-5pt}
\centering
\small
\caption{Comparison of open-source judges with respect to different input modes. Specifically, we study VLMs with single image input (single), pairwise image input (pair-f), and pairwise image input in reverse order (pair-r). The best performance in each column is in bold.}
\resizebox{0.92\linewidth}{!}{%
\begin{tabular}{l|ccc|ccc|ccc}
\toprule
& \multicolumn{3}{c|}{\bf Alignment} & \multicolumn{3}{c|}{\bf Safety} & \multicolumn{3}{c}{\bf Artifact} \\
& single & pair-f & pair-r & single & pair-f & pair-r & single & pair-f & pair-r \\
\midrule
Qwen-VL-Chat$^\spadesuit$ & $29.1$ & $31.1$ & $\textbf{73.0}$ & $\textbf{33.5}$ & $6.8$ & $\textbf{60.1}$ & $19.8$ & $5.7$ & $41.5$ \\
InternVL-Chat-V1-5$^\spadesuit$ & $\textbf{32.8}$ & $\textbf{75.8}$ & $34.8$ & $20.1$ & $5.9$ & $4.6$ & $38.8$ & $\textbf{91.8}$ & $40.7$ \\
Idefics2-8b$^\spadesuit$ & $30.2$ & $32.6$ & $32.6$ & $27.3$ & $\textbf{13.7}$ & $32.6$ & $\textbf{40.2}$ & $49.0$ & $\textbf{43.2}$ \\
\bottomrule
\end{tabular}%
}
\label{exp:judge_consitiency}
\end{table}
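To make the pair-f versus pair-r comparison concrete, the sketch below shows one way such an order-consistency check could be implemented. It is a minimal illustration under assumed interfaces, not the evaluation harness behind this table: the \texttt{judge\_pair} helper, the \texttt{order\_consistency} function, and the sample format are hypothetical placeholders standing in for the actual judge API.

\begin{verbatim}
# Minimal sketch (assumed interfaces, not the paper's actual harness) of the
# pair-f / pair-r protocol: query a VLM judge on an image pair in both
# orders and measure how often its verdict survives the swap.

def judge_pair(judge, prompt, img_a, img_b):
    """Hypothetical helper: returns 'A' or 'B' for the preferred image."""
    raise NotImplementedError  # depends on the specific VLM judge API

def order_consistency(judge, samples):
    # samples: iterable of (prompt, img_a, img_b, human_label) tuples,
    # where human_label is 'A' or 'B' in the original image order.
    acc_f = acc_r = consistent = 0
    for prompt, img_a, img_b, label in samples:
        v_f = judge_pair(judge, prompt, img_a, img_b)  # pair-f: original order
        v_r = judge_pair(judge, prompt, img_b, img_a)  # pair-r: swapped order
        v_r = 'A' if v_r == 'B' else 'B'               # map verdict back
        acc_f += (v_f == label)
        acc_r += (v_r == label)
        consistent += (v_f == v_r)  # same verdict regardless of image order?
    n = len(samples)
    return acc_f / n, acc_r / n, consistent / n
\end{verbatim}

Under this reading, a large gap between a judge's pair-f and pair-r accuracies (e.g., Qwen-VL-Chat's $31.1$ vs.\ $73.0$ on Alignment) indicates a strong position bias: the verdict depends on the order in which the two images are presented.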