Spaces:

FSMBench
/

Leaderboard

Sleeping

App Files Files Community

taesiri committed on Apr 14

Commit

75d681a

•

1 Parent(s): e0656c6

update

Browse files

Files changed (1) hide show

app.py +130 -9

app.py CHANGED Viewed

@@ -1,7 +1,11 @@
 import gradio as gr
 import pandas as pd
 from glob import glob
 # Load text benchmark results
 csv_results = glob("results/*.pkl")
@@ -30,9 +34,7 @@ cot_text_data = load_data(cot_text_results, "CoT Text Only")
 # cot_vision_data = load_data(cot_vision_results, "CoT Vision")
 # Combine all data into a single DataFrame
-all_data = pd.concat(
-    [data, vision_data, cot_text_data], ignore_index=True
-)
 all_model_names = all_data["Model Name"].unique()
 all_text_only_model_names = list(
@@ -43,10 +45,13 @@ all_cot_text_only_models = list(
 )
 ## Continue with the old code --
 # TODO: Update me to read from all_data for later
 # Load the csv files into a dict with keys being name of the file and values being the data
 data = {file: pd.read_pickle(file) for file in csv_results}
 # Load the vision files into a dict
@@ -145,7 +150,7 @@ def load_cot_vision_heatmap(evt: gr.SelectData):
 def calculate_order_by_first_substring(selected_models):
     first_columns = all_data[all_data["substring_index"] == 1]
     query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
     query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
@@ -158,6 +163,7 @@ def calculate_order_by_first_substring(selected_models):
     text_only = all_data[all_data["Model Type"] == "Text Only"]
     text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
     query_ids = text_only_filtered.query_id.unique()
     text_only_filtered = (
@@ -180,9 +186,8 @@ def calculate_order_by_first_substring(selected_models):
     return text_only_filtered, number_of_queries, number_of_fsms
 def calculate_order_by_first_substring_cot(selected_models):
     first_columns = all_data[all_data["substring_index"] == 1]
     query_ids_df = first_columns[first_columns["Model Type"] == "CoT Text Only"]
     query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
@@ -195,6 +200,7 @@ def calculate_order_by_first_substring_cot(selected_models):
     text_only = all_data[all_data["Model Type"] == "CoT Text Only"]
     text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
     query_ids = text_only_filtered.query_id.unique()
     text_only_filtered = (
@@ -217,6 +223,108 @@ def calculate_order_by_first_substring_cot(selected_models):
     return text_only_filtered, number_of_queries, number_of_fsms
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
@@ -273,6 +381,7 @@ with gr.Blocks() as demo:
             number_of_fsms = gr.Textbox(label="Number of included  FSMs")
         constrained_leader_board_text = gr.Dataframe()
     included_models.select(
         fn=calculate_order_by_first_substring,
@@ -281,7 +390,6 @@ with gr.Blocks() as demo:
         queue=True,
     )
     with gr.Tab("Constraint Text-only Results (CoT)"):
         gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
         included_models_cot = gr.CheckboxGroup(
@@ -295,12 +403,25 @@ with gr.Blocks() as demo:
             number_of_fsms_cot = gr.Textbox(label="Number of included  FSMs")
         constrained_leader_board_text_cot = gr.Dataframe()
     included_models_cot.select(
         fn=calculate_order_by_first_substring_cot,
         inputs=[included_models_cot],
-        outputs=[constrained_leader_board_text_cot, number_of_queries_cot, number_of_fsms_cot],
         queue=True,
     )
     demo.launch()

 import gradio as gr
 import pandas as pd
 from glob import glob
+import matplotlib.pyplot as plt
+import seaborn as sns
+from matplotlib.colors import ListedColormap, BoundaryNorm
+from glob import glob
+import os
 # Load text benchmark results
 csv_results = glob("results/*.pkl")
 # cot_vision_data = load_data(cot_vision_results, "CoT Vision")
 # Combine all data into a single DataFrame
+all_data = pd.concat([data, vision_data, cot_text_data], ignore_index=True)
 all_model_names = all_data["Model Name"].unique()
 all_text_only_model_names = list(
 )
+text_only_filtered_raw = None
+text_only_filtered_raw_cot = None
 ## Continue with the old code --
 # TODO: Update me to read from all_data for later
 # Load the csv files into a dict with keys being name of the file and values being the data
 data = {file: pd.read_pickle(file) for file in csv_results}
 # Load the vision files into a dict
 def calculate_order_by_first_substring(selected_models):
+    global text_only_filtered_raw
     first_columns = all_data[all_data["substring_index"] == 1]
     query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
     query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
     text_only = all_data[all_data["Model Type"] == "Text Only"]
     text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
+    text_only_filtered_raw = text_only_filtered.copy()
     query_ids = text_only_filtered.query_id.unique()
     text_only_filtered = (
     return text_only_filtered, number_of_queries, number_of_fsms
 def calculate_order_by_first_substring_cot(selected_models):
+    global text_only_filtered_raw_cot
     first_columns = all_data[all_data["substring_index"] == 1]
     query_ids_df = first_columns[first_columns["Model Type"] == "CoT Text Only"]
     query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
     text_only = all_data[all_data["Model Type"] == "CoT Text Only"]
     text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
+    text_only_filtered_raw_cot = text_only_filtered.copy()
     query_ids = text_only_filtered.query_id.unique()
     text_only_filtered = (
     return text_only_filtered, number_of_queries, number_of_fsms
+def _build_constraint_heatmap(filtered_df, model_name):
+    """Render a per-FSM / per-substring heatmap of one model's judged responses.
+
+    Rows are labelled "<num_states> states, <num_alphabet> alphabet", columns
+    are ``substring_index`` values, and each cell holds the first
+    ``parsed_judge_response`` seen for that pair. Missing cells are filled
+    with -1. The colour bands are [-1.5, -0.5) light blue, [-0.5, 0.5) red
+    and [0.5, 1.5] green.
+
+    Args:
+        filtered_df: Snapshot DataFrame stored by the constrained-leaderboard
+            callbacks, or None if no snapshot has been taken yet.
+        model_name: Value matched against the "Model Name" column.
+
+    Returns:
+        The matplotlib Figure, or None when there is nothing to plot (no
+        snapshot yet, no rows for ``model_name``, or no judged responses).
+    """
+    if filtered_df is None:
+        # No model selection has populated the snapshot yet.
+        return None
+    # Copy the slice so adding a column neither warns nor touches the snapshot.
+    model_df = filtered_df[filtered_df["Model Name"] == model_name].copy()
+    if model_df.empty:
+        # e.g. the clicked table cell was not a model name.
+        return None
+    model_df["fsm_info"] = [
+        f"{states} states, {alphabet} alphabet"
+        for states, alphabet in zip(model_df["num_states"], model_df["num_alphabet"])
+    ]
+    model_df = model_df.sort_values(by=["num_states", "num_alphabet"])
+    pivot_df = model_df.pivot_table(
+        index="fsm_info",
+        columns="substring_index",
+        values="parsed_judge_response",
+        aggfunc="first",
+    )
+    if pivot_df.empty:
+        return None
+    # pivot_table sorts the string index alphabetically ("10 states" before
+    # "2 states"), so re-apply the numeric order established by sort_values.
+    ordered_labels = [
+        label for label in model_df["fsm_info"].unique() if label in pivot_df.index
+    ]
+    pivot_df = pivot_df.reindex(ordered_labels).fillna(-1).astype(float)
+    cmap = ListedColormap(["lightblue", "red", "green"])
+    norm = BoundaryNorm([-1.5, -0.5, 0.5, 1.5], cmap.N)
+    # NOTE(review): figures created through pyplot stay registered until they
+    # are closed; confirm whether the caller should close them after rendering.
+    fig, ax = plt.subplots(figsize=(12, 8))
+    sns.heatmap(
+        pivot_df,
+        cmap=cmap,
+        linewidths=1,
+        linecolor="black",
+        norm=norm,
+        cbar=False,
+        square=True,
+        ax=ax,
+    )
+    # Decorate the axes we own instead of pyplot's implicit "current" axes.
+    ax.set_title(f"Heatmap for Model: {model_name}", fontsize=20, weight="bold")
+    ax.set_xlabel("Substring Index")
+    ax.set_ylabel("FSM (States, Alphabet)")
+    ax.tick_params(axis="x", labelrotation=45)
+    return fig
+def generate_heatmap_for_specific_model(model_name):
+    """Build the heatmap for ``model_name`` from ``text_only_filtered_raw``.
+
+    Returns a matplotlib Figure, or None when the snapshot is unset or holds
+    no plottable rows for ``model_name``.
+    """
+    return _build_constraint_heatmap(text_only_filtered_raw, model_name)
+def generate_heatmap_for_specific_model_cot(model_name):
+    """Build the heatmap for ``model_name`` from ``text_only_filtered_raw_cot``.
+
+    Returns a matplotlib Figure, or None when the snapshot is unset or holds
+    no plottable rows for ``model_name``.
+    """
+    return _build_constraint_heatmap(text_only_filtered_raw_cot, model_name)
+def show_constraint_heatmap(evt: gr.SelectData):
+    """Select handler: build the text-only heatmap for the selected value.
+
+    ``evt.value`` is forwarded unchanged as the model name; this handler does
+    not check that the selected cell actually holds a model name.
+    """
+    return generate_heatmap_for_specific_model(evt.value)
+def show_constraint_heatmap_cot(evt: gr.SelectData):
+    """Select handler: build the CoT heatmap for the selected value.
+
+    ``evt.value`` is forwarded unchanged as the model name; this handler does
+    not check that the selected cell actually holds a model name.
+    """
+    return generate_heatmap_for_specific_model_cot(evt.value)
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
             number_of_fsms = gr.Textbox(label="Number of included  FSMs")
         constrained_leader_board_text = gr.Dataframe()
+        constrained_leader_board_plot = gr.Plot()
     included_models.select(
         fn=calculate_order_by_first_substring,
         queue=True,
     )
     with gr.Tab("Constraint Text-only Results (CoT)"):
         gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
         included_models_cot = gr.CheckboxGroup(
             number_of_fsms_cot = gr.Textbox(label="Number of included  FSMs")
         constrained_leader_board_text_cot = gr.Dataframe()
+        constrained_leader_board_plot_cot = gr.Plot()
     included_models_cot.select(
         fn=calculate_order_by_first_substring_cot,
         inputs=[included_models_cot],
+        outputs=[
+            constrained_leader_board_text_cot,
+            number_of_queries_cot,
+            number_of_fsms_cot,
+        ],
         queue=True,
     )
+    constrained_leader_board_text.select(
+        fn=show_constraint_heatmap, outputs=[constrained_leader_board_plot]
+    )
+    constrained_leader_board_text_cot.select(
+        fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
+    )
     demo.launch()