Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
@@ -1,7 +1,11 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
from glob import glob
|
4 |
-
|
|
|
|
|
|
|
|
|
5 |
|
6 |
# Load text benchmark results
|
7 |
csv_results = glob("results/*.pkl")
|
@@ -30,9 +34,7 @@ cot_text_data = load_data(cot_text_results, "CoT Text Only")
|
|
30 |
# cot_vision_data = load_data(cot_vision_results, "CoT Vision")
|
31 |
|
32 |
# Combine all data into a single DataFrame
|
33 |
-
all_data = pd.concat(
|
34 |
-
[data, vision_data, cot_text_data], ignore_index=True
|
35 |
-
)
|
36 |
|
37 |
all_model_names = all_data["Model Name"].unique()
|
38 |
all_text_only_model_names = list(
|
@@ -43,10 +45,13 @@ all_cot_text_only_models = list(
|
|
43 |
)
|
44 |
|
45 |
|
|
|
|
|
46 |
|
47 |
## Continue with the old code --
|
48 |
# TODO: Update me to read from all_data for later
|
49 |
|
|
|
50 |
# Load the pickled result files into a dict with keys being the file path and values being the loaded data
|
51 |
data = {file: pd.read_pickle(file) for file in csv_results}
|
52 |
# Load the vision files into a dict
|
@@ -145,7 +150,7 @@ def load_cot_vision_heatmap(evt: gr.SelectData):
|
|
145 |
|
146 |
|
147 |
def calculate_order_by_first_substring(selected_models):
|
148 |
-
|
149 |
first_columns = all_data[all_data["substring_index"] == 1]
|
150 |
query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
|
151 |
query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
|
@@ -158,6 +163,7 @@ def calculate_order_by_first_substring(selected_models):
|
|
158 |
|
159 |
text_only = all_data[all_data["Model Type"] == "Text Only"]
|
160 |
text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
|
|
|
161 |
|
162 |
query_ids = text_only_filtered.query_id.unique()
|
163 |
text_only_filtered = (
|
@@ -180,9 +186,8 @@ def calculate_order_by_first_substring(selected_models):
|
|
180 |
return text_only_filtered, number_of_queries, number_of_fsms
|
181 |
|
182 |
|
183 |
-
|
184 |
def calculate_order_by_first_substring_cot(selected_models):
|
185 |
-
|
186 |
first_columns = all_data[all_data["substring_index"] == 1]
|
187 |
query_ids_df = first_columns[first_columns["Model Type"] == "CoT Text Only"]
|
188 |
query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
|
@@ -195,6 +200,7 @@ def calculate_order_by_first_substring_cot(selected_models):
|
|
195 |
|
196 |
text_only = all_data[all_data["Model Type"] == "CoT Text Only"]
|
197 |
text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
|
|
|
198 |
|
199 |
query_ids = text_only_filtered.query_id.unique()
|
200 |
text_only_filtered = (
|
@@ -217,6 +223,108 @@ def calculate_order_by_first_substring_cot(selected_models):
|
|
217 |
return text_only_filtered, number_of_queries, number_of_fsms
|
218 |
|
219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
with gr.Blocks() as demo:
|
221 |
gr.Markdown("# FSM Benchmark Leaderboard")
|
222 |
with gr.Tab("Text-only Benchmark"):
|
@@ -273,6 +381,7 @@ with gr.Blocks() as demo:
|
|
273 |
number_of_fsms = gr.Textbox(label="Number of included FSMs")
|
274 |
|
275 |
constrained_leader_board_text = gr.Dataframe()
|
|
|
276 |
|
277 |
included_models.select(
|
278 |
fn=calculate_order_by_first_substring,
|
@@ -281,7 +390,6 @@ with gr.Blocks() as demo:
|
|
281 |
queue=True,
|
282 |
)
|
283 |
|
284 |
-
|
285 |
with gr.Tab("Constraint Text-only Results (CoT)"):
|
286 |
gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
|
287 |
included_models_cot = gr.CheckboxGroup(
|
@@ -295,12 +403,25 @@ with gr.Blocks() as demo:
|
|
295 |
number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")
|
296 |
|
297 |
constrained_leader_board_text_cot = gr.Dataframe()
|
|
|
298 |
|
299 |
included_models_cot.select(
|
300 |
fn=calculate_order_by_first_substring_cot,
|
301 |
inputs=[included_models_cot],
|
302 |
-
outputs=[
|
|
|
|
|
|
|
|
|
303 |
queue=True,
|
304 |
)
|
305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
306 |
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
from glob import glob
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
from matplotlib.colors import ListedColormap, BoundaryNorm
|
7 |
+
from glob import glob
|
8 |
+
import os
|
9 |
|
10 |
# Load text benchmark results
|
11 |
csv_results = glob("results/*.pkl")
|
|
|
34 |
# cot_vision_data = load_data(cot_vision_results, "CoT Vision")
|
35 |
|
36 |
# Combine all data into a single DataFrame
|
37 |
+
all_data = pd.concat([data, vision_data, cot_text_data], ignore_index=True)
|
|
|
|
|
38 |
|
39 |
all_model_names = all_data["Model Name"].unique()
|
40 |
all_text_only_model_names = list(
|
|
|
45 |
)
|
46 |
|
47 |
|
48 |
+
text_only_filtered_raw = None
|
49 |
+
text_only_filtered_raw_cot = None
|
50 |
|
51 |
## Continue with the old code --
|
52 |
# TODO: Update me to read from all_data for later
|
53 |
|
54 |
+
|
55 |
# Load the pickled result files into a dict with keys being the file path and values being the loaded data
|
56 |
data = {file: pd.read_pickle(file) for file in csv_results}
|
57 |
# Load the vision files into a dict
|
|
|
150 |
|
151 |
|
152 |
def calculate_order_by_first_substring(selected_models):
|
153 |
+
global text_only_filtered_raw
|
154 |
first_columns = all_data[all_data["substring_index"] == 1]
|
155 |
query_ids_df = first_columns[first_columns["Model Type"] == "Text Only"]
|
156 |
query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
|
|
|
163 |
|
164 |
text_only = all_data[all_data["Model Type"] == "Text Only"]
|
165 |
text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
|
166 |
+
text_only_filtered_raw = text_only_filtered.copy()
|
167 |
|
168 |
query_ids = text_only_filtered.query_id.unique()
|
169 |
text_only_filtered = (
|
|
|
186 |
return text_only_filtered, number_of_queries, number_of_fsms
|
187 |
|
188 |
|
|
|
189 |
def calculate_order_by_first_substring_cot(selected_models):
|
190 |
+
global text_only_filtered_raw_cot
|
191 |
first_columns = all_data[all_data["substring_index"] == 1]
|
192 |
query_ids_df = first_columns[first_columns["Model Type"] == "CoT Text Only"]
|
193 |
query_ids_df = query_ids_df[query_ids_df["Model Name"].isin(selected_models)]
|
|
|
200 |
|
201 |
text_only = all_data[all_data["Model Type"] == "CoT Text Only"]
|
202 |
text_only_filtered = text_only[text_only["fsm_id"].isin(fsm_ids)]
|
203 |
+
text_only_filtered_raw_cot = text_only_filtered.copy()
|
204 |
|
205 |
query_ids = text_only_filtered.query_id.unique()
|
206 |
text_only_filtered = (
|
|
|
223 |
return text_only_filtered, number_of_queries, number_of_fsms
|
224 |
|
225 |
|
226 |
+
def _render_constraint_heatmap(raw_df, model_name):
    """Build the FSM-by-substring heatmap figure for a single model.

    Args:
        raw_df: Cached DataFrame of constrained results (one of the
            module-level ``text_only_filtered_raw*`` frames), or ``None``
            when no model selection has populated the cache yet.
        model_name: Value matched against the ``"Model Name"`` column.

    Returns:
        The matplotlib ``Figure`` holding the heatmap.

    Raises:
        gr.Error: If the cache is empty or holds no plottable rows for
            ``model_name``.
    """
    if raw_df is None:
        raise gr.Error("Select at least one model before requesting a heatmap.")

    # Work on a copy: adding "fsm_info" to a filtered slice of the cached
    # frame would trigger pandas' SettingWithCopy behaviour.
    model_df = raw_df[raw_df["Model Name"] == model_name].copy()
    if model_df.empty:
        raise gr.Error(f"No constrained results found for model: {model_name}")

    model_df = model_df.sort_values(by=["num_states", "num_alphabet"])
    model_df["fsm_info"] = model_df.apply(
        lambda x: f"{x['num_states']} states, {x['num_alphabet']} alphabet", axis=1
    )

    pivot_df = model_df.pivot_table(
        index="fsm_info",
        columns="substring_index",
        values="parsed_judge_response",
        aggfunc="first",
    )
    if pivot_df.empty:
        raise gr.Error(f"No judged responses to plot for model: {model_name}")

    # pivot_table sorts its string index lexically ("10 states" would come
    # before "2 states"), so restore the numeric (num_states, num_alphabet)
    # order established by sort_values above.
    ordered_labels = [
        label
        for label in model_df["fsm_info"].drop_duplicates()
        if label in pivot_df.index
    ]
    pivot_df = pivot_df.reindex(ordered_labels).fillna(-1).astype(float)

    # Three discrete buckets: -1 (missing, introduced by fillna) -> lightblue,
    # 0 -> red, 1 -> green. Presumably 0/1 mean incorrect/correct judge
    # verdicts -- TODO confirm against the results schema.
    cmap = ListedColormap(["lightblue", "red", "green"])
    norm = BoundaryNorm([-1.5, -0.5, 0.5, 1.5], cmap.N)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(
        pivot_df,
        cmap=cmap,
        linewidths=1,
        linecolor="black",
        norm=norm,
        cbar=False,
        square=True,
        ax=ax,
    )
    # Use the Axes object rather than pyplot's implicit "current figure",
    # which is shared global state across concurrent requests.
    ax.set_title(f"Heatmap for Model: {model_name}", fontsize=20, weight="bold")
    ax.set_xlabel("Substring Index")
    ax.set_ylabel("FSM (States, Alphabet)")
    ax.tick_params(axis="x", labelrotation=45)

    # Drop the figure from pyplot's registry so repeated clicks do not
    # accumulate open figures; the returned Figure can still be rendered.
    plt.close(fig)
    return fig


def generate_heatmap_for_specific_model(model_name):
    """Return the heatmap figure for ``model_name`` from the text-only cache.

    Reads the module-level ``text_only_filtered_raw`` frame, which
    ``calculate_order_by_first_substring`` assigns.
    """
    return _render_constraint_heatmap(text_only_filtered_raw, model_name)


def generate_heatmap_for_specific_model_cot(model_name):
    """Return the heatmap figure for ``model_name`` from the CoT text-only cache.

    Reads the module-level ``text_only_filtered_raw_cot`` frame, which
    ``calculate_order_by_first_substring_cot`` assigns.
    """
    return _render_constraint_heatmap(text_only_filtered_raw_cot, model_name)
|
316 |
+
|
317 |
+
|
318 |
+
def show_constraint_heatmap(evt: gr.SelectData):
    """Gradio select handler for the text-only constrained leaderboard.

    Passes the selected value (``evt.value``) on as the model name and
    returns the resulting heatmap figure.
    """
    return generate_heatmap_for_specific_model(evt.value)
|
321 |
+
|
322 |
+
|
323 |
+
def show_constraint_heatmap_cot(evt: gr.SelectData):
    """Gradio select handler for the CoT text-only constrained leaderboard.

    Passes the selected value (``evt.value``) on as the model name and
    returns the resulting CoT heatmap figure.
    """
    return generate_heatmap_for_specific_model_cot(evt.value)
|
326 |
+
|
327 |
+
|
328 |
with gr.Blocks() as demo:
|
329 |
gr.Markdown("# FSM Benchmark Leaderboard")
|
330 |
with gr.Tab("Text-only Benchmark"):
|
|
|
381 |
number_of_fsms = gr.Textbox(label="Number of included FSMs")
|
382 |
|
383 |
constrained_leader_board_text = gr.Dataframe()
|
384 |
+
constrained_leader_board_plot = gr.Plot()
|
385 |
|
386 |
included_models.select(
|
387 |
fn=calculate_order_by_first_substring,
|
|
|
390 |
queue=True,
|
391 |
)
|
392 |
|
|
|
393 |
with gr.Tab("Constraint Text-only Results (CoT)"):
|
394 |
gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
|
395 |
included_models_cot = gr.CheckboxGroup(
|
|
|
403 |
number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")
|
404 |
|
405 |
constrained_leader_board_text_cot = gr.Dataframe()
|
406 |
+
constrained_leader_board_plot_cot = gr.Plot()
|
407 |
|
408 |
included_models_cot.select(
|
409 |
fn=calculate_order_by_first_substring_cot,
|
410 |
inputs=[included_models_cot],
|
411 |
+
outputs=[
|
412 |
+
constrained_leader_board_text_cot,
|
413 |
+
number_of_queries_cot,
|
414 |
+
number_of_fsms_cot,
|
415 |
+
],
|
416 |
queue=True,
|
417 |
)
|
418 |
|
419 |
+
constrained_leader_board_text.select(
|
420 |
+
fn=show_constraint_heatmap, outputs=[constrained_leader_board_plot]
|
421 |
+
)
|
422 |
+
|
423 |
+
constrained_leader_board_text_cot.select(
|
424 |
+
fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
|
425 |
+
)
|
426 |
+
|
427 |
demo.launch()
|