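"""Streamlit viewer for the cross-model detection results (ROC AUC heatmaps and
model-size scatter plots) of "From Text to Source" (arXiv:2309.13322)."""
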
import json

import pandas as pd
import plotly.express as px
import streamlit as st
from plotly.subplots import make_subplots

from exp_utils import MODELS
from visualize_utils import viridis_rgb

st.set_page_config(
page_title="Results Viewer",
page_icon="📊",
initial_sidebar_state="expanded",
layout="wide",
)
MODELS_SIZE_MAPPING = {k: v["model_size"] for k, v in MODELS.items()}
MODELS_FAMILY_MAPPING = {k: v["model_family"] for k, v in MODELS.items()}
MODEL_FAMILIES = sorted({model["model_family"] for model in MODELS.values()})
MODEL_NAMES = list(MODELS.keys())
MODEL_NAMES_SORTED_BY_NAME_AND_SIZE = sorted(
MODEL_NAMES, key=lambda x: (MODELS[x]["model_family"], MODELS[x]["model_size"])
)
MODEL_NAMES_SORTED_BY_SIZE = sorted(
MODEL_NAMES, key=lambda x: (MODELS[x]["model_size"], MODELS[x]["model_family"])
)
# sort MODELS_SIZE_MAPPING by value then by key
MODELS_SIZE_MAPPING = {
k: v
for k, v in sorted(MODELS_SIZE_MAPPING.items(), key=lambda item: (item[1], item[0]))
}
MODELS_SIZE_MAPPING_LIST = list(MODELS_SIZE_MAPPING.keys())
CHAT_MODELS = [x for x in MODEL_NAMES_SORTED_BY_NAME_AND_SIZE if MODELS[x]["is_chat"]]
def clean_dataframe(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
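    """Drop bookkeeping columns, average each model's per-seed results,
    and attach the model_family / model_size metadata.

    Returns a (df_avg, df_std) pair of DataFrames indexed by model name.
    """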
    # drop bookkeeping columns (epoch, loss, runtime, throughput, ...) that are not scores
words_to_remove = [
"epoch",
"loss",
"runtime",
"samples_per_second",
"steps_per_second",
"samples",
"results_dir",
]
df = df.loc[
:,
~df.columns.str.contains("|".join(words_to_remove), case=False, regex=True),
]
    # strip the "_roc_auc" and "eval_" affixes from the remaining column names
df.columns = df.columns.str.replace("_roc_auc", "")
df.columns = df.columns.str.replace("eval_", "")
df["model_family"] = df["model_name"].map(MODELS_FAMILY_MAPPING)
    # map each model_name to its model_family (re-attached after the groupby below)
    model_family_dict = dict(zip(df["model_name"], df["model_family"]))
    # average the results over the 5 seeds for each model (seed column is exp_seed)
    df_avg = df.groupby(["model_name"]).mean(numeric_only=True)
    df_std = df.groupby(["model_name"]).std(numeric_only=True)
    # remove the exp_seed column (the seed itself is not a score)
    df_avg = df_avg.drop(columns=["exp_seed"])
    df_std = df_std.drop(columns=["exp_seed"])
df_avg["model_family"] = df_avg.index.map(model_family_dict)
df_std["model_family"] = df_std.index.map(model_family_dict)
df_avg["model_size"] = df_avg.index.map(MODELS_SIZE_MAPPING)
df_std["model_size"] = df_std.index.map(MODELS_SIZE_MAPPING)
# sort rows by model family then model size
df_avg = df_avg.sort_values(
by=["model_family", "model_size"], ascending=[True, True]
)
df_std = df_std.sort_values(
by=["model_family", "model_size"], ascending=[True, True]
)
    # keep only the rows whose model also appears as a column, ordered like the columns
    available_rows = [x for x in df_avg.columns if x in df_avg.index]
    df_avg = df_avg.reindex(available_rows)
    available_rows = [x for x in df_std.columns if x in df_std.index]
    df_std = df_std.reindex(available_rows)
return df_avg, df_std
def get_data(path):
df, df_std = clean_dataframe(pd.read_csv(path, index_col=0))
return df, df_std
def filter_df(
df: pd.DataFrame,
model_family_train: list,
model_family_test: list,
model_size_train: tuple,
model_size_test: tuple,
is_chat_train: bool,
is_chat_test: bool,
sort_by_size: bool,
split_chat_models: bool,
is_debug: bool,
) -> pd.DataFrame:
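    """Filter the result matrix according to the sidebar selections.

    Rows are the models the detector was trained on, columns are the target
    models it is evaluated against. is_chat_train / is_chat_test may be True,
    False, or "Both". Returns the filtered and reordered DataFrame.
    """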
    # filter rows (train-side models)
if is_debug:
st.write("No filters")
st.write(df)
df = df.loc[
(df["model_size"] >= model_size_train[0] * 1e9)
& (df["model_size"] <= model_size_train[1] * 1e9)
]
if is_debug:
st.write("Filter model size train")
st.write(df)
df = df.loc[df["model_family"].isin(model_family_train)]
if is_debug:
st.write("Filter model family train")
st.write(df)
if is_chat_train != "Both":
df = df.loc[df["is_chat"] == is_chat_train]
if is_debug:
st.write("Filter is chat train")
st.write(df)
# filter columns
if is_debug:
st.write("No filters")
st.write(df)
columns_to_keep = []
for column in df.columns:
if column in MODELS.keys():
model_size = MODELS[column]["model_size"]
if (
model_size >= model_size_test[0] * 1e9
and model_size <= model_size_test[1] * 1e9
):
columns_to_keep.append(column)
    df = df[sorted(set(columns_to_keep))]
if is_debug:
st.write("Filter model size test")
st.write(df)
# filter columns
columns_to_keep = []
for column in df.columns:
for model_family in model_family_test:
if model_family == MODELS[column]["model_family"]:
columns_to_keep.append(column)
    df = df[sorted(set(columns_to_keep))]
if is_debug:
st.write("Filter model family test")
st.write(df)
if is_chat_test != "Both":
# filter columns
columns_to_keep = []
for column in df.columns:
if MODELS[column]["is_chat"] == is_chat_test:
columns_to_keep.append(column)
        df = df[sorted(set(columns_to_keep))]
if is_debug:
st.write("Filter is chat test")
st.write(df)
df = df.select_dtypes(include="number")
if is_debug:
st.write("Select dtypes to be only numbers")
st.write(df)
if sort_by_size:
columns_in = [x for x in MODEL_NAMES_SORTED_BY_SIZE if x in df.columns]
else:
columns_in = [x for x in MODEL_NAMES_SORTED_BY_NAME_AND_SIZE if x in df.columns]
df = df[columns_in]
if is_debug:
st.write("Sort columns")
st.write(df)
    # sort rows the same way as the columns (by size, or by family then size)
    if sort_by_size:
        available_rows = [x for x in MODEL_NAMES_SORTED_BY_SIZE if x in df.index]
    else:
        available_rows = [
            x for x in MODEL_NAMES_SORTED_BY_NAME_AND_SIZE if x in df.index
        ]
    df = df.reindex(available_rows)
if is_debug:
st.write("Sort rows")
st.write(df)
if split_chat_models:
# put chat models at the end of the columns
chat_models = [x for x in CHAT_MODELS if x in df.columns]
# sort chat models by size
chat_models = sorted(chat_models, key=lambda x: MODELS[x]["model_size"])
df = df[[x for x in df.columns if x not in chat_models] + chat_models]
# put chat models at the end of the rows
chat_models = [x for x in CHAT_MODELS if x in df.index]
# sort chat models by size
chat_models = sorted(chat_models, key=lambda x: MODELS[x]["model_size"])
df = df.reindex([x for x in df.index if x not in chat_models] + chat_models)
if is_debug:
st.write("Split chat models")
st.write(df)
return df
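# load the main result matrix and the adversarial (out-of-domain) results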
df, df_std = get_data("./deberta_results.csv")
with open("./ood_results.json", "r") as f:
ood_results = json.load(f)
ood_results = pd.DataFrame(ood_results)
ood_results = ood_results.set_index("model_name")
ood_results = ood_results.drop(
columns=["exp_name", "accuracy", "f1", "precision", "recall"]
)
ood_results.columns = ["seed", "Adversarial"]
ood_results_avg = ood_results.groupby(["model_name"]).mean()
ood_results_std = ood_results.groupby(["model_name"]).std()
st.write(
"""### Results Viewer 👇
## From Text to Source: Results in Detecting Large Language Model-Generated Content
### Wissam Antoun, Benoît Sagot, Djamé Seddah
##### ALMAnaCH, Inria
##### Paper: [https://arxiv.org/abs/2309.13322](https://arxiv.org/abs/2309.13322)
"""
)
# filters
show_diff = st.sidebar.checkbox("Show Diff", value=False)
sort_by_size = st.sidebar.checkbox("Sort by size", value=False)
split_chat_models = st.sidebar.checkbox("Split chat models", value=False)
add_mean = st.sidebar.checkbox("Add mean", value=False)
show_std = st.sidebar.checkbox("Show std", value=False)
model_size_train = st.sidebar.slider(
"Train Model Size in Billion", min_value=0, max_value=100, value=(0, 100), step=1
)
model_size_test = st.sidebar.slider(
"Test Model Size in Billion", min_value=0, max_value=100, value=(0, 100), step=1
)
is_chat_train = st.sidebar.selectbox("(Train) Is Chat?", [True, False, "Both"], index=2)
is_chat_test = st.sidebar.selectbox("(Test) Is Chat?", [True, False, "Both"], index=2)
model_family_train = st.sidebar.multiselect(
    "Model Family Train",
    MODEL_FAMILIES,
    default=MODEL_FAMILIES,
)
model_family_test = st.sidebar.multiselect(
    "Model Family Test",
    MODEL_FAMILIES + ["Adversarial"],
    default=MODEL_FAMILIES,
)
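# "Adversarial" is not a model family; when selected, the OOD results are joined as an extra column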
add_adversarial = False
if "Adversarial" in model_family_test:
model_family_test.remove("Adversarial")
add_adversarial = True
sort_by_adversarial = False
if add_adversarial:
sort_by_adversarial = st.sidebar.checkbox("Sort by adversarial", value=False)
if st.sidebar.checkbox("Use default color scale", value=False):
color_scale = "Viridis_r"
else:
color_scale = viridis_rgb
is_debug = st.sidebar.checkbox("Debug", value=False)
if show_std:
selected_df = df_std.copy()
else:
selected_df = df.copy()
if show_diff:
    # set aside the metadata columns so that only the score matrix is diffed
columns_to_keep = ["model_size", "model_family", "is_chat"]
to_be_added = selected_df[columns_to_keep]
selected_df = selected_df.drop(columns=columns_to_keep)
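    # subtract the diagonal (self-detection scores) from each column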
selected_df = selected_df.sub(selected_df.values.diagonal(), axis=1)
selected_df = selected_df.join(to_be_added)
filtered_df = filter_df(
selected_df,
model_family_train,
model_family_test,
model_size_train,
model_size_test,
is_chat_train,
is_chat_test,
sort_by_size,
split_chat_models,
is_debug,
)
if add_adversarial:
    # join only the adversarial ROC AUC column, not the averaged seed
    filtered_df = filtered_df.join(ood_results_avg[["Adversarial"]])
if add_mean:
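    # add a "mean" column (each train model's average over targets) and a "mean" row
    # (each target's average over train models); keep the diagonal (self-detection)
    # for the scatter plots below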
col_mean = filtered_df.mean(axis=1)
row_mean = filtered_df.mean(axis=0)
diag = filtered_df.values.diagonal()
filtered_df["mean"] = col_mean
filtered_df.loc["mean"] = row_mean
filtered_df = filtered_df * 100
filtered_df = filtered_df.round(0)
# sort by the column called Adversarial
if sort_by_adversarial:
filtered_df = filtered_df.sort_values(by=["Adversarial"], ascending=False)
# stop early if the filters removed every row or column
if filtered_df.empty:
    st.write("No results found")
    st.stop()
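# heatmap of the selected scores (scaled to 0-100): rows are train models, columns are target models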
fig = px.imshow(
filtered_df.values,
x=list(filtered_df.columns),
y=list(filtered_df.index),
color_continuous_scale=color_scale,
contrast_rescaling=None,
text_auto=True,
aspect="auto",
)
# width = st.sidebar.text_input("Width", "1920")
# height = st.sidebar.text_input("Height", "1080")
# scale = st.sidebar.text_input("Scale", "1.0")
# margin = st.sidebar.text_input("Margin[l,r,b,t]", "200,100,100,100")
fig.update_traces(textfont_size=9)
fig.update_layout(
xaxis={"side": "top"},
yaxis={"side": "left"},
# margin=dict(
# l=int(margin.split(",")[0]),
# r=int(margin.split(",")[1]),
# b=int(margin.split(",")[2]),
# t=int(margin.split(",")[3]),
# ),
font=dict(size=10),
)
fig.update_xaxes(tickangle=45)
fig.update_xaxes(tickmode="linear")
fig.update_yaxes(tickmode="linear")
# change the font in the heatmap
st.plotly_chart(fig, use_container_width=True)
# if st.sidebar.button("save", key="save"):
# fig.write_image(
# "fig1.pdf",
# width=int(width),
# height=int(height),
# validate=True,
# scale=float(scale),
# )
# scatter plots of detection scores vs. model size (only when the mean row/column is added and diffs are off)
if add_mean and not show_diff:
    # warn if any chat models survived the filters; they skew the size trends
    if any(x in filtered_df.columns for x in CHAT_MODELS) or any(
        x in filtered_df.index for x in CHAT_MODELS
    ):
        st.warning(
            "Chat models are in the filtered df columns or index. "
            "This will cause the mean graph to be skewed."
        )
fig3 = px.scatter(
y=row_mean,
x=[MODELS[x]["model_size"] for x in filtered_df.columns if x not in ["mean"]],
# hover_data=[x for x in filtered_df.index if x not in ["mean"]],
color=[
MODELS[x]["model_family"] for x in filtered_df.columns if x not in ["mean"]
],
color_discrete_sequence=px.colors.qualitative.Plotly,
title="",
# x axis title
labels={
"x": "Target Model Size",
"y": "Average ROC AUC",
"color": "Model Family",
},
log_x=True,
trendline="ols",
)
fig4 = px.scatter(
y=diag,
x=[MODELS[x]["model_size"] for x in filtered_df.columns if x not in ["mean"]],
# hover_data=[x for x in filtered_df.index if x not in ["mean"]],
color=[
MODELS[x]["model_family"] for x in filtered_df.columns if x not in ["mean"]
],
color_discrete_sequence=px.colors.qualitative.Plotly,
title="",
# x axis title
labels={
"x": "Target Model Size",
"y": "Self ROC AUC",
"color": "Model Family",
},
log_x=True,
trendline="ols",
)
# put the two plots side by side
fig_subplot = make_subplots(
rows=1,
cols=2,
shared_yaxes=False,
subplot_titles=("Self Detection ROC AUC", "Average Target ROC AUC"),
)
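    # copy the traces from both scatter figures into the subplot, keeping a single legend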
    for i, figure in enumerate([fig4, fig3]):
        for trace_data in figure["data"]:
            if i == 1:
                trace_data["showlegend"] = False
            fig_subplot.add_trace(trace_data, row=1, col=i + 1)
fig_subplot.update_xaxes(type="log")
# y axis range
fig_subplot.update_yaxes(range=[0.90, 1])
fig_subplot.update_layout(
height=500,
width=1200,
)
# put the legend on the bottom
fig_subplot.update_layout(
legend=dict(orientation="h", yanchor="bottom", y=-0.2, x=0.09)
)
st.plotly_chart(fig_subplot, use_container_width=True)
fig2 = px.scatter(
y=col_mean,
x=[MODELS_SIZE_MAPPING[x] for x in filtered_df.index if x not in ["mean"]],
# hover_data=[x for x in filtered_df.index if x not in ["mean"]],
color=[
MODELS_FAMILY_MAPPING[x] for x in filtered_df.index if x not in ["mean"]
],
color_discrete_sequence=px.colors.qualitative.Plotly,
title="Mean vs Train Model Size",
log_x=True,
trendline="ols",
)
fig2.update_layout(
height=600,
width=900,
)
st.plotly_chart(fig2, use_container_width=False)