Shanshan Wang
committed on
Commit
•
9581bcd
1
Parent(s):
50fb3ca
updated multimodal benchmarks
Browse files- app.py +64 -9
- filtered_opencompass.csv +16 -0
app.py
CHANGED
@@ -4,8 +4,10 @@ import gradio as gr
|
|
4 |
|
5 |
|
6 |
data_path = '0926-OCRBench-opensource.csv'
|
|
|
7 |
data = pd.read_csv(data_path).fillna(0)
|
8 |
|
|
|
9 |
# set the data types for the columns
|
10 |
dtype_dict = {
|
11 |
"Model": str,
|
@@ -69,18 +71,71 @@ def plot_metric(selected_metric):
|
|
69 |
|
70 |
return fig
|
71 |
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
def create_interface():
|
74 |
with gr.Blocks() as interface:
|
75 |
-
with gr.
|
76 |
-
with gr.
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
-
|
83 |
-
|
84 |
|
85 |
return interface
|
86 |
|
|
|
4 |
|
5 |
|
6 |
data_path = '0926-OCRBench-opensource.csv'
|
7 |
+
data_mmlm_path = 'filtered_opencompass.csv'
|
8 |
data = pd.read_csv(data_path).fillna(0)
|
9 |
|
10 |
+
######## OCRBench ########
|
11 |
# set the data types for the columns
|
12 |
dtype_dict = {
|
13 |
"Model": str,
|
|
|
71 |
|
72 |
return fig
|
73 |
|
74 |
+
|
75 |
+
####### OpenCompass ########
|
76 |
+
data_mmlm = pd.read_csv(data_mmlm_path).fillna(0)  # load the OpenCompass CSV; missing cells become 0
|
77 |
+
data_mmlm.rename(columns={"Avg. Score (8 single image benchmarks)": "Average Score"}, inplace=True)  # shorter name for the average column
|
78 |
+
metrics_column = list(data_mmlm.columns)[6:]  # columns from index 6 onward: "Average Score" followed by the per-benchmark score columns
|
79 |
+
|
80 |
+
def plot_metric_mmlm_grouped(category):
|
81 |
+
# Build a grouped bar chart of every metric in metrics_column for the rows of data_mmlm whose "Category" equals `category`
|
82 |
+
filtered_data = data_mmlm[data_mmlm["Category"] == category].copy()
|
83 |
+
|
84 |
+
# Melt to long format: one row per (model, metric) pair, with a "Metrics" column and a "Score" column
|
85 |
+
melted_data = pd.melt(
|
86 |
+
filtered_data,
|
87 |
+
id_vars=["Models"], # Keep the "Models" column as the identifier
|
88 |
+
value_vars=metrics_column, # Melt all the metric columns
|
89 |
+
var_name="Metrics", # Name for the new column containing metrics
|
90 |
+
value_name="Score" # Name for the new column containing scores
|
91 |
+
)
|
92 |
+
|
93 |
+
# Generate a grouped bar chart: metrics on the x-axis, one bar per model within each metric
|
94 |
+
fig = px.bar(
|
95 |
+
melted_data,
|
96 |
+
x="Metrics",
|
97 |
+
y="Score",
|
98 |
+
color="Models", # Differentiate models by color
|
99 |
+
barmode="group", # Grouped bars
|
100 |
+
title=f"Scores for All Metrics in {category} Category"
|
101 |
+
)
|
102 |
+
|
103 |
+
fig.update_layout(
|
104 |
+
xaxis_title="Metrics",
|
105 |
+
yaxis_title="Score",
|
106 |
+
height=600,
|
107 |
+
margin=dict(t=50, l=50, r=100, b=50),
|
108 |
+
)
|
109 |
+
return fig
|
110 |
+
|
111 |
+
|
112 |
+
# Gradio Blocks Interface with Tabs
|
113 |
def create_interface():
|
114 |
with gr.Blocks() as interface:
|
115 |
+
with gr.Tabs():
|
116 |
+
with gr.Tab("OCRBench"):
|
117 |
+
with gr.Row():
|
118 |
+
with gr.Column(scale=4): # Column for the plot (scale 4 relative to the dropdown column's scale 1)
|
119 |
+
plot = gr.Plot(value=plot_metric("OCRBench"), label="OCR Benchmark Metrics") # initial plot shows the "OCRBench" metric
|
120 |
+
with gr.Column(scale=1): # Column for the dropdown (scale 1 relative to the plot column's scale 4)
|
121 |
+
metrics = list(data_valid.columns[5:-1]) # Metric choices: data_valid columns from index 5 up to, but not including, the last one (NOTE(review): confirm which columns this slice skips)
|
122 |
+
dropdown = gr.Dropdown(metrics, label="Select Metric", value="OCRBench")
|
123 |
+
|
124 |
+
# Re-render the plot with plot_metric whenever the dropdown selection changes
|
125 |
+
dropdown.change(fn=plot_metric, inputs=dropdown, outputs=plot)
|
126 |
+
|
127 |
+
|
128 |
+
with gr.Tab("8 Multi-modal Benchmarks"):
|
129 |
+
with gr.Row():
|
130 |
+
# Dropdown for selecting the category; choices are the distinct "Category" values in data_mmlm
|
131 |
+
categories = data_mmlm["Category"].unique().tolist()
|
132 |
+
category_dropdown = gr.Dropdown(categories, label="Select Category", value=categories[0])
|
133 |
+
|
134 |
+
with gr.Row():
|
135 |
+
mm_plot = gr.Plot(value=plot_metric_mmlm_grouped(categories[0]), label="Grouped Metrics for Models")
|
136 |
|
137 |
+
# Re-render the grouped plot whenever the category dropdown selection changes
|
138 |
+
category_dropdown.change(fn=plot_metric_mmlm_grouped, inputs=category_dropdown, outputs=mm_plot)
|
139 |
|
140 |
return interface
|
141 |
|
filtered_opencompass.csv
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Category,Models,Type,Params (B),Language Model,Vision Model,Avg. Score (8 single image benchmarks),MMBench V1.1_TEST,MMStar,MMMU,Math Vista,Hallusion Bench Avg,AI2D_TEST,OCR Bench,MMVet
|
2 |
+
Similar score models,Qwen2-VL-2B,Open,2.1,Qwen2-1.5B,ViT-600M,57.3,72.2,47.5,42.2,47.8,42.4,74.7,79.7,51.5
|
3 |
+
Similar score models,H2O-Mississippi-2B,Open,2.1,Danube2 1.8B,InternViT-300M,54.5,64.8,49.6,35.2,56.8,36.4,69.9,78.2,44.7
|
4 |
+
Similar score models,InternVL2-2B,Open,2.1,InternLM2-1.8B,InternViT-300M,54.0,69.6,49.8,36.3,46.0,38.0,74.1,78.1,39.7
|
5 |
+
Similar score models,Phi-3-Vision - Microsoft,Open,4.2,Phi-3,CLIP ViT-L/14,53.6,65.2,47.7,46.1,44.6,39.0,78.4,63.7,44.1
|
6 |
+
Similar score models,Claude3-Opus - Anthropic,Closed,Unknown,,,54.4,59.1,45.7,54.9,45.8,37.8,70.6,69.4,51.7
|
7 |
+
Similar score models,Claude3-Sonnet- Anthropic,Closed,Unknown,,,53.5,63.9,44.2,47.4,45.0,41.3,69.9,64.6,51.7
|
8 |
+
Similar score models,Cambrian-13B,Open,13,Vicuna-v1.5-13B,CLIP ViT-L/14,53.3,67.5,47.1,41.6,47.4,39.4,73.6,61.0,48.9
|
9 |
+
Similar score models,Qwen-VL-Plus - Alibaba,Closed,Unknown,,,52.2,66.2,39.7,39.8,37.6,40.6,65.7,72.6,55.7
|
10 |
+
Similar size models,Qwen2-VL-2B,Open,2.1,Qwen2-1.5B,ViT-600M,57.3,72.2,47.5,42.2,47.8,42.4,74.7,79.7,51.5
|
11 |
+
Similar size models,H2O-Mississippi-2B,Open,2.1,Danube2 1.8B,InternViT-300M,54.5,64.8,49.6,35.2,56.8,36.4,69.9,78.2,44.7
|
12 |
+
Similar size models,InternVL2-2B,Open,2.1,InternLM2-1.8B,InternViT-300M,54.0,69.6,49.8,36.3,46.0,38.0,74.1,78.1,39.7
|
13 |
+
Similar size models,Phi-3-Vision - Microsoft,Open,4.2,Phi-3,CLIP ViT-L/14,53.6,65.2,47.7,46.1,44.6,39.0,78.4,63.7,44.1
|
14 |
+
Similar size models,MiniCPM-V-2 ,Open,2.8,MiniCPM-2.4B,SigLip-400M,47.9,65.8,39.1,38.2,39.8,36.1,62.9,60.5,41.0
|
15 |
+
Similar size models,PaliGemma-3B-mix-448 ,Open,3,Gemma-2B,SigLip-400M,46.6,65.6,48.3,34.9,28.7,32.2,68.3,61.4,33.1
|
16 |
+
Similar size models,DeepSeek-VL-1.3B ,Open,2,DeepSeek-1B,SAM-B & SigLIP-L,39.6,63.8,39.9,33.8,29.8,27.6,51.5,41.3,29.2
|