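"""Gradio app for the FSM Benchmark Leaderboard.

Loads evaluation results from final_df.pkl, builds an accuracy table per
benchmark category (Text-only, CoT, Vision, Vision-CoT, 1shot), and shows a
pre-rendered per-model heatmap when a row in the leaderboard is selected.
"""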
import os
from glob import glob
import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.colors import BoundaryNorm, ListedColormap
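# final_df.pkl is expected to hold one row per (model, question) with at least
# the columns used below: "model", "category", "difficulty_level", and
# "is_answer_correct".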
all_results = pd.read_pickle("final_df.pkl")

def get_accuracy_dataframe(df_mother, category):
    # Keep only the rows for the requested benchmark category
    df = df_mother[df_mother["category"] == category].copy()

    # Overall accuracy per model
    df["is_answer_correct"] = df["is_answer_correct"].astype(float)
    model_accuracy = df.groupby("model")["is_answer_correct"].mean().reset_index()

    # Accuracy per model and difficulty level
    df["difficulty_level"] = df["difficulty_level"].astype(int)
    model_accuracy_per_level = (
        df.groupby(["model", "difficulty_level"])["is_answer_correct"]
        .mean()
        .reset_index()
    )
    model_accuracy_per_level_df = model_accuracy_per_level.pivot(
        index="model", columns="difficulty_level", values="is_answer_correct"
    )

    # Merge overall accuracy and level-based accuracy into a single DataFrame
    model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on="model")
    model_accuracy_df.rename(
        columns={"is_answer_correct": "Overall Accuracy"}, inplace=True
    )
    # Keep only the last path component of the model identifier
    model_accuracy_df["model"] = model_accuracy_df["model"].apply(
        lambda x: x.split("/")[-1]
    )

    # Ensure all expected difficulty levels are present
    expected_levels = [1, 2, 3, 4]  # Adjust based on your data
    for level in expected_levels:
        if level not in model_accuracy_df.columns:
            model_accuracy_df[level] = None  # Fill missing levels

    # Rename level columns and fix the column order so the display headers
    # assigned below line up even when a level had to be back-filled
    level_columns = {level: f"Level {level} Accuracy" for level in expected_levels}
    model_accuracy_df.rename(columns=level_columns, inplace=True)
    ordered_columns = ["model", "Overall Accuracy"] + [
        f"Level {level} Accuracy" for level in expected_levels
    ]
    model_accuracy_df = model_accuracy_df[ordered_columns]

    # Convert to percentages with one decimal place
    model_accuracy_df = model_accuracy_df.applymap(
        lambda x: round(x * 100, 1) if isinstance(x, float) else x
    )

    # Apply display headers with icons and sort by overall accuracy
    model_accuracy_df.columns = [
        "πŸ€– Model Name",
        "⭐ Overall",
        "πŸ“ˆ Level 1",
        "πŸ” Level 2",
        "πŸ“˜ Level 3",
        "πŸ”¬ Level 4",
    ]
    model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
    return model_accuracy_df
# Benchmark categories present in the results: '1shot', 'CoT', 'Textonly', 'vision', 'vision-CoT'
accuracy_df_textonly = get_accuracy_dataframe(all_results, "Textonly")
accuracy_df_cot = get_accuracy_dataframe(all_results, "CoT")
accuracy_df_vision = get_accuracy_dataframe(all_results, "vision")
accuracy_df_vision_cot = get_accuracy_dataframe(all_results, "vision-CoT")
accuracy_df_1shot = get_accuracy_dataframe(all_results, "1shot")
# Define the column names with icons
headers_with_icons = [
    "πŸ€– Model Name",
    "⭐ Overall",
    "πŸ“ˆ Level 1",
    "πŸ” Level 2",
    "πŸ“˜ Level 3",
    "πŸ”¬ Level 4",
]

# Plain-text equivalents of the headers above (not referenced elsewhere in this file)
column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]
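# Heatmap loaders: each handler returns the pre-rendered heatmap image for the
# model selected in the leaderboard table. evt.value is the value of the clicked
# cell (expected to be the model name), and the images are assumed to live at
# ./heatmaps/<model>_<category>.jpg, matching the paths used below.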
def load_heatmap_textonly(evt: gr.SelectData):
    return gr.Image(f"./heatmaps/{evt.value}_Textonly.jpg")


def load_heatmap_cot(evt: gr.SelectData):
    return gr.Image(f"./heatmaps/{evt.value}_CoT.jpg")


def load_heatmap_vision(evt: gr.SelectData):
    return gr.Image(f"./heatmaps/{evt.value}_vision.jpg")


def load_heatmap_vision_cot(evt: gr.SelectData):
    return gr.Image(f"./heatmaps/{evt.value}_vision-CoT.jpg")


def load_heatmap_1shot(evt: gr.SelectData):
    return gr.Image(f"./heatmaps/{evt.value}_1shot.jpg")
# Build the UI: one tab per benchmark category, each with a leaderboard and a heatmap viewer
with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    # Text-only Benchmark
    with gr.Tab("Text-only Benchmark"):
        leader_board_textonly = gr.Dataframe(
            accuracy_df_textonly, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_textonly = gr.Image(label="", show_label=False)
        leader_board_textonly.select(
            fn=load_heatmap_textonly, outputs=[heatmap_image_textonly]
        )

    # CoT Benchmark
    with gr.Tab("CoT Benchmark"):
        leader_board_cot = gr.Dataframe(accuracy_df_cot, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image_cot = gr.Image(label="", show_label=False)
        leader_board_cot.select(fn=load_heatmap_cot, outputs=[heatmap_image_cot])

    # Vision Benchmark
    with gr.Tab("Vision Benchmark"):
        leader_board_vision = gr.Dataframe(
            accuracy_df_vision, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=load_heatmap_vision, outputs=[heatmap_image_vision]
        )

    # Vision-CoT Benchmark
    with gr.Tab("Vision-CoT Benchmark"):
        leader_board_vision_cot = gr.Dataframe(
            accuracy_df_vision_cot, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision_cot = gr.Image(label="", show_label=False)
        leader_board_vision_cot.select(
            fn=load_heatmap_vision_cot, outputs=[heatmap_image_vision_cot]
        )

    # 1shot Benchmark
    with gr.Tab("1shot Benchmark"):
        leader_board_1shot = gr.Dataframe(
            accuracy_df_1shot, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_1shot = gr.Image(label="", show_label=False)
        leader_board_1shot.select(
            fn=load_heatmap_1shot, outputs=[heatmap_image_1shot]
        )

demo.launch()