import gradio as gr
import pandas as pd
all_results = pd.read_pickle("final_df.pkl")
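# Expected columns in final_df.pkl (inferred from how the frame is used below;
# adjust if your pickle differs):
#   model              - model identifier, possibly namespaced like "org/model-name"
#   category           - benchmark variant: "1shot", "CoT", "Textonly", "vision", "vision-CoT"
#   difficulty_level   - integer difficulty level (1-4)
#   is_answer_correct  - per-question correctness flag, castable to float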

def get_accuracy_dataframe(df_mother, category):
    # Keep only the rows for the requested benchmark category
    df = df_mother[df_mother["category"] == category].copy()
    df["is_answer_correct"] = df["is_answer_correct"].astype(float)

    # Overall accuracy per model
    model_accuracy = df.groupby("model")["is_answer_correct"].mean().reset_index()

    # Accuracy per model and difficulty level
    df["difficulty_level"] = df["difficulty_level"].astype(int)
    model_accuracy_per_level = (
        df.groupby(["model", "difficulty_level"])["is_answer_correct"]
        .mean()
        .reset_index()
    )
    model_accuracy_per_level_df = model_accuracy_per_level.pivot(
        index="model", columns="difficulty_level", values="is_answer_correct"
    )

    # Merge overall accuracy and per-level accuracy into a single DataFrame
    model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on="model")
    model_accuracy_df.rename(
        columns={"is_answer_correct": "Overall Accuracy"}, inplace=True
    )
    model_accuracy_df["model"] = model_accuracy_df["model"].apply(
        lambda x: x.split("/")[-1]
    )

    # Ensure all expected difficulty levels are present
    expected_levels = [1, 2, 3, 4]  # Adjust based on your data
    for level in expected_levels:
        if level not in model_accuracy_df.columns:
            model_accuracy_df[level] = None  # Fill missing levels with None

    # Rename the level columns
    level_columns = {level: f"Level {level} Accuracy" for level in expected_levels}
    model_accuracy_df.rename(columns=level_columns, inplace=True)

    # Convert to percentages rounded to one decimal place
    model_accuracy_df = model_accuracy_df.applymap(
        lambda x: round(x * 100, 1) if isinstance(x, float) else x
    )

    # Add headers with icons
    model_accuracy_df.columns = [
        "🤖 Model Name",
        "⭐ Overall",
        "📈 Level 1",
        "📈 Level 2",
        "📈 Level 3",
        "🔬 Level 4",
    ]
    model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
    return model_accuracy_df
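# The returned leaderboard has one row per model (namespace prefix stripped),
# accuracies expressed as percentages with one decimal place, and rows sorted
# by the overall score in descending order.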
# categories = array(['1shot', 'CoT', 'Textonly', 'vision', 'vision-CoT'], dtype=object)
accuracy_df_textonly = get_accuracy_dataframe(all_results, "Textonly")
accuracy_df_cot = get_accuracy_dataframe(all_results, "CoT")
accuracy_df_vision = get_accuracy_dataframe(all_results, "vision")
accuracy_df_vision_cot = get_accuracy_dataframe(all_results, "vision-CoT")
accuracy_df_1shot = get_accuracy_dataframe(all_results, "1shot")
# Define the column names with icons
headers_with_icons = [
    "🤖 Model Name",
    "⭐ Overall",
    "📈 Level 1",
    "📈 Level 2",
    "📈 Level 3",
    "🔬 Level 4",
]
column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]

def load_heatmap_textonly(evt: gr.SelectData):
    heatmap_image = gr.Image(f"./heatmaps/{evt.value}_Textonly.jpg")
    return heatmap_image


def load_heatmap_cot(evt: gr.SelectData):
    heatmap_image = gr.Image(f"./heatmaps/{evt.value}_CoT.jpg")
    return heatmap_image


def load_heatmap_vision(evt: gr.SelectData):
    heatmap_image = gr.Image(f"./heatmaps/{evt.value}_vision.jpg")
    return heatmap_image


def load_heatmap_vision_cot(evt: gr.SelectData):
    heatmap_image = gr.Image(f"./heatmaps/{evt.value}_vision-CoT.jpg")
    return heatmap_image


def load_heatmap_1shot(evt: gr.SelectData):
    heatmap_image = gr.Image(f"./heatmaps/{evt.value}_1shot.jpg")
    return heatmap_image
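# Note on the loaders above: each one assumes a pre-rendered image at
# ./heatmaps/<cell value>_<category>.jpg. gr.SelectData.value holds the content of
# the clicked cell, so selecting a model name in the first column loads that model's
# heatmap; selecting any other cell will look up a file named after that cell's value.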
# Build the UI: one tab per benchmark category, each with a leaderboard and heatmap.
with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    # Text-only Benchmark
    with gr.Tab("Text-only Benchmark"):
        leader_board_textonly = gr.Dataframe(
            accuracy_df_textonly, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_textonly = gr.Image(label="", show_label=False)
        leader_board_textonly.select(
            fn=load_heatmap_textonly, outputs=[heatmap_image_textonly]
        )

    # CoT Benchmark
    with gr.Tab("CoT Benchmark"):
        leader_board_cot = gr.Dataframe(accuracy_df_cot, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_cot = gr.Image(label="", show_label=False)
        leader_board_cot.select(fn=load_heatmap_cot, outputs=[heatmap_image_cot])

    # Vision Benchmark
    with gr.Tab("Vision Benchmark"):
        leader_board_vision = gr.Dataframe(
            accuracy_df_vision, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=load_heatmap_vision, outputs=[heatmap_image_vision]
        )

    # Vision-CoT Benchmark
    with gr.Tab("Vision-CoT Benchmark"):
        leader_board_vision_cot = gr.Dataframe(
            accuracy_df_vision_cot, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision_cot = gr.Image(label="", show_label=False)
        leader_board_vision_cot.select(
            fn=load_heatmap_vision_cot, outputs=[heatmap_image_vision_cot]
        )

    # 1shot Benchmark
    with gr.Tab("1shot Benchmark"):
        leader_board_1shot = gr.Dataframe(
            accuracy_df_1shot, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_1shot = gr.Image(label="", show_label=False)
        leader_board_1shot.select(fn=load_heatmap_1shot, outputs=[heatmap_image_1shot])

demo.launch()
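# Optional: if heatmap loading ever becomes a bottleneck on a shared Space, requests
# can be serialized with Gradio's built-in queue, e.g. demo.queue().launch().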