Shanshan Wang committed on
Commit
9581bcd
1 Parent(s): 50fb3ca

updated multimodal benchmarks

Browse files
Files changed (2) hide show
  1. app.py +64 -9
  2. filtered_opencompass.csv +16 -0
app.py CHANGED
@@ -4,8 +4,10 @@ import gradio as gr
4
 
5
 
6
  data_path = '0926-OCRBench-opensource.csv'
 
7
  data = pd.read_csv(data_path).fillna(0)
8
 
 
9
  # set the data types for the columns
10
  dtype_dict = {
11
  "Model": str,
@@ -69,18 +71,71 @@ def plot_metric(selected_metric):
69
 
70
  return fig
71
 
72
- # Gradio Blocks Interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def create_interface():
74
  with gr.Blocks() as interface:
75
- with gr.Row():
76
- with gr.Column(scale=4): # Column for the plot (takes 4 parts of the total space)
77
- plot = gr.Plot(value=plot_metric("OCRBench"), label="OCR Benchmark Metrics") # default plot component initially
78
- with gr.Column(scale=1): # Column for the dropdown (takes 1 part of the total space)
79
- metrics = list(data_valid.columns[5:-1]) # List of metric columns (excluding 'Model' and 'Parameter Size')
80
- dropdown = gr.Dropdown(metrics, label="Select Metric", value="OCRBench")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- # Update the plot when dropdown selection changes
83
- dropdown.change(fn=plot_metric, inputs=dropdown, outputs=plot)
84
 
85
  return interface
86
 
 
4
 
5
 
6
  data_path = '0926-OCRBench-opensource.csv'
7
+ data_mmlm_path = 'filtered_opencompass.csv'
8
  data = pd.read_csv(data_path).fillna(0)
9
 
10
+ ######## OCRBench ########
11
  # set the data types for the columns
12
  dtype_dict = {
13
  "Model": str,
 
71
 
72
  return fig
73
 
74
+
75
+ ####### OpenCompass ########
76
# Load the OpenCompass multi-modal results. Missing cells are replaced with 0,
# and the long average-score header is shortened to "Average Score".
data_mmlm = (
    pd.read_csv(data_mmlm_path)
    .fillna(0)
    .rename(columns={"Avg. Score (8 single image benchmarks)": "Average Score"})
)

# Every column from position 6 onward is treated as a plottable metric.
# NOTE(review): this relies on the CSV column order — confirm if the file changes.
metrics_column = data_mmlm.columns[6:].tolist()
79
+
80
def plot_metric_mmlm_grouped(category):
    """Build a grouped bar chart of all metric scores for one category.

    Args:
        category: Value compared against the "Category" column of
            ``data_mmlm`` to select the rows to plot.

    Returns:
        A Plotly figure with one bar group per metric and one color per model.
    """
    # Keep only the rows that belong to the requested category.
    in_category = data_mmlm["Category"] == category
    subset = data_mmlm.loc[in_category].copy()

    # Reshape wide -> long: one row per (model, metric) pair, with the metric
    # name in "Metrics" and its value in "Score".
    long_scores = subset.melt(
        id_vars=["Models"],
        value_vars=metrics_column,
        var_name="Metrics",
        value_name="Score",
    )

    # Metrics go on the x-axis; bars within each group are colored by model.
    chart = px.bar(
        long_scores,
        x="Metrics",
        y="Score",
        color="Models",
        barmode="group",
        title=f"Scores for All Metrics in {category} Category",
    )

    chart.update_layout(
        xaxis_title="Metrics",
        yaxis_title="Score",
        height=600,
        margin={"t": 50, "l": 50, "r": 100, "b": 50},
    )
    return chart
110
+
111
+
112
+ # Gradio Blocks Interface with Tabs
113
def create_interface():
    """Assemble the two-tab Gradio Blocks UI and return it.

    The "OCRBench" tab pairs a plot with a metric dropdown; the
    "8 Multi-modal Benchmarks" tab pairs a category dropdown with a grouped
    bar chart. Each dropdown re-renders its plot when the selection changes.

    Returns:
        The constructed ``gr.Blocks`` object (not launched).
    """
    # Dropdown choices come from the module-level data frames.
    # NOTE(review): the [5:-1] slice assumes a fixed column layout in
    # data_valid — confirm against the OCRBench CSV.
    ocr_metric_names = list(data_valid.columns[5:-1])
    category_names = data_mmlm["Category"].unique().tolist()

    with gr.Blocks() as interface:
        with gr.Tabs():
            with gr.Tab("OCRBench"):
                with gr.Row():
                    # Wide column (4 of 5 parts) holding the plot, shown
                    # initially for the "OCRBench" metric.
                    with gr.Column(scale=4):
                        ocr_plot = gr.Plot(
                            value=plot_metric("OCRBench"),
                            label="OCR Benchmark Metrics",
                        )
                    # Narrow column (1 of 5 parts) holding the metric selector.
                    with gr.Column(scale=1):
                        metric_dropdown = gr.Dropdown(
                            ocr_metric_names,
                            label="Select Metric",
                            value="OCRBench",
                        )

                # Redraw the OCR plot whenever a different metric is picked.
                metric_dropdown.change(
                    fn=plot_metric,
                    inputs=metric_dropdown,
                    outputs=ocr_plot,
                )

            with gr.Tab("8 Multi-modal Benchmarks"):
                # The first category in the data is the initial selection.
                initial_category = category_names[0]

                with gr.Row():
                    category_dropdown = gr.Dropdown(
                        category_names,
                        label="Select Category",
                        value=initial_category,
                    )

                with gr.Row():
                    mm_plot = gr.Plot(
                        value=plot_metric_mmlm_grouped(initial_category),
                        label="Grouped Metrics for Models",
                    )

                # Redraw the grouped chart whenever the category changes.
                category_dropdown.change(
                    fn=plot_metric_mmlm_grouped,
                    inputs=category_dropdown,
                    outputs=mm_plot,
                )

    return interface
141
 
filtered_opencompass.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Category,Models,Type,Params (B),Language Model,Vision Model,Avg. Score (8 single image benchmarks),MMBench V1.1_TEST,MMStar,MMMU,Math Vista,Hallusion Bench Avg,AI2D_TEST,OCR Bench,MMVet
2
+ Similar score models,Qwen2-VL-2B,Open,2.1,Qwen2-1.5B,ViT-600M,57.3,72.2,47.5,42.2,47.8,42.4,74.7,79.7,51.5
3
+ Similar score models,H2O-Mississippi-2B,Open,2.1,Danube2 1.8B,InternViT-300M,54.5,64.8,49.6,35.2,56.8,36.4,69.9,78.2,44.7
4
+ Similar score models,InternVL2-2B,Open,2.1,InternLM2-1.8B,InternViT-300M,54.0,69.6,49.8,36.3,46.0,38.0,74.1,78.1,39.7
5
+ Similar score models,Phi-3-Vision - Microsoft,Open,4.2,Phi-3,CLIP ViT-L/14,53.6,65.2,47.7,46.1,44.6,39.0,78.4,63.7,44.1
6
+ Similar score models,Claude3-Opus - Anthropic,Closed,Unknown,,,54.4,59.1,45.7,54.9,45.8,37.8,70.6,69.4,51.7
7
+ Similar score models,Claude3-Sonnet - Anthropic,Closed,Unknown,,,53.5,63.9,44.2,47.4,45.0,41.3,69.9,64.6,51.7
8
+ Similar score models,Cambrian-13B,Open,13,Vicuna-v1.5-13B,CLIP ViT-L/14,53.3,67.5,47.1,41.6,47.4,39.4,73.6,61.0,48.9
9
+ Similar score models,Qwen-VL-Plus - Alibaba,Closed,Unknown,,,52.2,66.2,39.7,39.8,37.6,40.6,65.7,72.6,55.7
10
+ Similar size models,Qwen2-VL-2B,Open,2.1,Qwen2-1.5B,ViT-600M,57.3,72.2,47.5,42.2,47.8,42.4,74.7,79.7,51.5
11
+ Similar size models,H2O-Mississippi-2B,Open,2.1,Danube2 1.8B,InternViT-300M,54.5,64.8,49.6,35.2,56.8,36.4,69.9,78.2,44.7
12
+ Similar size models,InternVL2-2B,Open,2.1,InternLM2-1.8B,InternViT-300M,54.0,69.6,49.8,36.3,46.0,38.0,74.1,78.1,39.7
13
+ Similar size models,Phi-3-Vision - Microsoft,Open,4.2,Phi-3,CLIP ViT-L/14,53.6,65.2,47.7,46.1,44.6,39.0,78.4,63.7,44.1
14
+ Similar size models,MiniCPM-V-2 ,Open,2.8,MiniCPM-2.4B,SigLip-400M,47.9,65.8,39.1,38.2,39.8,36.1,62.9,60.5,41.0
15
+ Similar size models,PaliGemma-3B-mix-448 ,Open,3,Gemma-2B,SigLip-400M,46.6,65.6,48.3,34.9,28.7,32.2,68.3,61.4,33.1
16
+ Similar size models,DeepSeek-VL-1.3B ,Open,2,DeepSeek-1B,SAM-B & SigLIP-L,39.6,63.8,39.9,33.8,29.8,27.6,51.5,41.3,29.2