|
import json |
|
|
|
import numpy as np |
|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.figure_factory as ff |
|
import plotly.graph_objects as go |
|
import streamlit as st |
|
from plotly.subplots import make_subplots |
|
|
|
from exp_utils import MODELS |
|
from visualize_utils import viridis_rgb |
|
|
|
|
|
|
|
# Page-wide Streamlit settings for the results viewer.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="Results Viewer",
    page_icon="π",
)

# Per-model lookups derived from the MODELS registry.
MODELS_FAMILY_MAPPING = {name: spec["model_family"] for name, spec in MODELS.items()}
MODEL_FAMILES = set(MODELS_FAMILY_MAPPING.values())
MODEL_NAMES = list(MODELS)

# Model names ordered by family first, then by size inside each family.
MODEL_NAMES_SORTED_BY_NAME_AND_SIZE = sorted(
    MODEL_NAMES,
    key=lambda name: (MODELS[name]["model_family"], MODELS[name]["model_size"]),
)

# Model names ordered by size first, ties broken by family.
MODEL_NAMES_SORTED_BY_SIZE = sorted(
    MODEL_NAMES,
    key=lambda name: (MODELS[name]["model_size"], MODELS[name]["model_family"]),
)

# name -> size, with entries inserted in ascending (size, name) order.
MODELS_SIZE_MAPPING = dict(
    sorted(
        ((name, spec["model_size"]) for name, spec in MODELS.items()),
        key=lambda entry: (entry[1], entry[0]),
    )
)
MODELS_SIZE_MAPPING_LIST = list(MODELS_SIZE_MAPPING)

# Models flagged "is_chat", kept in family/size order.
CHAT_MODELS = [
    name for name in MODEL_NAMES_SORTED_BY_NAME_AND_SIZE if MODELS[name]["is_chat"]
]
|
|
|
|
|
def _summarise_per_model(frame: pd.DataFrame, family_by_model: dict) -> pd.DataFrame:
    """Attach model metadata to a per-model summary table and order its rows.

    ``frame`` is indexed by model name. The ``exp_seed`` column is dropped,
    ``model_family`` / ``model_size`` columns are added from the lookups, and
    the rows are finally re-ordered to follow the order in which the same
    names appear among the columns (rows without a matching column are
    dropped by the reindex).
    """
    frame = frame.drop(columns=["exp_seed"])
    frame["model_family"] = frame.index.map(family_by_model)
    frame["model_size"] = frame.index.map(MODELS_SIZE_MAPPING)
    frame = frame.sort_values(
        by=["model_family", "model_size"], ascending=[True, True]
    )
    # The reindex below imposes the column order on the rows, so this is the
    # ordering callers actually observe.
    ordered_rows = [name for name in frame.columns if name in frame.index]
    return frame.reindex(ordered_rows)


def clean_dataframe(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Aggregate raw per-run results into per-model mean and std tables.

    Steps:
      1. Drop every column whose name contains one of the bookkeeping words
         (case-insensitive): epoch, loss, runtime, throughput, samples,
         results_dir.
      2. Strip the ``_roc_auc`` and ``eval_`` fragments from column names.
      3. Group the runs by ``model_name`` and compute the mean and the
         standard deviation of the numeric/boolean columns.
      4. Add ``model_family`` and ``model_size`` columns and align the row
         order with the column order.

    Args:
        df: Raw results; must contain ``model_name`` and ``exp_seed`` columns.

    Returns:
        ``(df_avg, df_std)`` - two frames indexed by model name.

    Raises:
        KeyError: if ``model_name`` or ``exp_seed`` is missing.
    """
    words_to_remove = [
        "epoch",
        "loss",
        "runtime",
        "samples_per_second",
        "steps_per_second",
        "samples",
        "results_dir",
    ]
    noise_pattern = "|".join(words_to_remove)
    keep_mask = ~df.columns.str.contains(noise_pattern, case=False, regex=True)
    # Copy so the renames/assignments below act on an independent frame
    # instead of a slice of the caller's data.
    df = df.loc[:, keep_mask].copy()

    # Plain substring removal; no regular-expression semantics are needed.
    df.columns = df.columns.str.replace("_roc_auc", "", regex=False)
    df.columns = df.columns.str.replace("eval_", "", regex=False)

    df["model_family"] = df["model_name"].map(MODELS_FAMILY_MAPPING)
    model_family_dict = dict(zip(df["model_name"], df["model_family"]))

    # Aggregate only numeric/boolean columns: mean()/std() over string columns
    # such as "model_family" raises TypeError on pandas >= 2.0.
    numeric_runs = df.set_index("model_name").select_dtypes(include=["number", "bool"])
    runs_by_model = numeric_runs.groupby(level="model_name")
    df_avg = _summarise_per_model(runs_by_model.mean(), model_family_dict)
    df_std = _summarise_per_model(runs_by_model.std(), model_family_dict)

    return df_avg, df_std
|
|
|
|
|
def get_data(path):
    """Read the results CSV at ``path`` and return ``clean_dataframe``'s output.

    The first CSV column is used as the index. The return value is the
    (mean table, std table) pair produced by ``clean_dataframe``.
    """
    raw_results = pd.read_csv(path, index_col=0)
    mean_table, std_table = clean_dataframe(raw_results)
    return mean_table, std_table
|
|
|
|
|
def _debug_dump(is_debug, label, frame):
    """Write ``label`` followed by ``frame`` to the page when debugging is on."""
    if is_debug:
        st.write(label)
        st.write(frame)


def _select_columns(frame, predicate):
    """Keep the columns of ``frame`` whose name satisfies ``predicate``.

    Surviving names are de-duplicated and laid out in sorted order.
    """
    return frame[sorted({column for column in frame.columns if predicate(column)})]


def filter_df(
    df: pd.DataFrame,
    model_family_train: list,
    model_family_test: list,
    model_size_train: tuple,
    model_size_test: tuple,
    is_chat_train: "bool | str",
    is_chat_test: "bool | str",
    sort_by_size: bool,
    split_chat_models: bool,
    is_debug: bool,
) -> pd.DataFrame:
    """Restrict a train-model x test-model table to the selected models.

    Rows are filtered through the ``model_size``, ``model_family`` and
    ``is_chat`` columns of ``df``; columns are filtered through the ``MODELS``
    registry, so any column that is not a registered model name is dropped.

    Args:
        df: Table indexed by model name with one column per model plus
            ``model_size``, ``model_family`` and ``is_chat`` columns.
        model_family_train: Families whose rows are kept.
        model_family_test: Families whose columns are kept.
        model_size_train: ``(min, max)`` row size bounds; each bound is
            multiplied by 1e9 before comparison, bounds inclusive.
        model_size_test: ``(min, max)`` column size bounds, same convention.
        is_chat_train: ``True``/``False`` to keep only rows with that
            ``is_chat`` value, or the string ``"Both"`` to skip the filter.
        is_chat_test: Same as ``is_chat_train`` but applied to columns.
        sort_by_size: Order rows and columns by ``MODEL_NAMES_SORTED_BY_SIZE``
            instead of ``MODEL_NAMES_SORTED_BY_NAME_AND_SIZE``.
        split_chat_models: Move chat models, sorted by size, to the end of
            both axes.
        is_debug: Write the intermediate frame to the page after each stage.

    Returns:
        The filtered, numeric-only and re-ordered frame.
    """
    _debug_dump(is_debug, "No filters", df)

    # ---- Train side: filter rows ----
    df = df.loc[
        df["model_size"].between(model_size_train[0] * 1e9, model_size_train[1] * 1e9)
    ]
    _debug_dump(is_debug, "Filter model size train", df)

    df = df.loc[df["model_family"].isin(model_family_train)]
    _debug_dump(is_debug, "Filter model family train", df)

    if is_chat_train != "Both":
        df = df.loc[df["is_chat"] == is_chat_train]
        _debug_dump(is_debug, "Filter is chat train", df)

    # ---- Test side: filter columns ----
    # Train filters have already run here, so this dump is no longer labelled
    # "No filters".
    _debug_dump(is_debug, "Before test filters", df)

    test_min = model_size_test[0] * 1e9
    test_max = model_size_test[1] * 1e9
    # The membership test also drops the metadata columns, which is why the
    # later predicates can index MODELS directly.
    df = _select_columns(
        df,
        lambda column: column in MODELS
        and test_min <= MODELS[column]["model_size"] <= test_max,
    )
    _debug_dump(is_debug, "Filter model size test", df)

    df = _select_columns(
        df, lambda column: MODELS[column]["model_family"] in model_family_test
    )
    _debug_dump(is_debug, "Filter model family test", df)

    if is_chat_test != "Both":
        df = _select_columns(
            df, lambda column: MODELS[column]["is_chat"] == is_chat_test
        )
        _debug_dump(is_debug, "Filter is chat test", df)

    df = df.select_dtypes(include="number")
    _debug_dump(is_debug, "Select dtypes to be only numbers", df)

    # ---- Ordering of both axes ----
    if sort_by_size:
        ordering = MODEL_NAMES_SORTED_BY_SIZE
    else:
        ordering = MODEL_NAMES_SORTED_BY_NAME_AND_SIZE

    df = df[[name for name in ordering if name in df.columns]]
    _debug_dump(is_debug, "Sort columns", df)

    df = df.reindex([name for name in ordering if name in df.index])
    _debug_dump(is_debug, "Sort rows", df)

    if split_chat_models:

        def size_of(name):
            return MODELS[name]["model_size"]

        chat_columns = sorted(
            (name for name in CHAT_MODELS if name in df.columns), key=size_of
        )
        chat_column_set = set(chat_columns)
        df = df[
            [name for name in df.columns if name not in chat_column_set] + chat_columns
        ]

        chat_rows = sorted(
            (name for name in CHAT_MODELS if name in df.index), key=size_of
        )
        chat_row_set = set(chat_rows)
        df = df.reindex(
            [name for name in df.index if name not in chat_row_set] + chat_rows
        )
        _debug_dump(is_debug, "Split chat models", df)

    return df
|
|
|
|
|
# Per-model mean and standard-deviation tables built from the results CSV.
df, df_std = get_data("./deberta_results.csv")

# Per-run records loaded from ood_results.json.
with open("./ood_results.json", "r") as ood_file:
    ood_results = pd.DataFrame(json.load(ood_file))

unused_ood_columns = ["exp_name", "accuracy", "f1", "precision", "recall"]
ood_results = ood_results.set_index("model_name").drop(columns=unused_ood_columns)
# The two columns left after the drop are relabelled by position.
ood_results.columns = ["seed", "Adversarial"]

# Aggregate the runs of each model.
ood_runs_by_model = ood_results.groupby(["model_name"])
ood_results_avg = ood_runs_by_model.mean()
ood_results_std = ood_runs_by_model.std()
|
|
|
|
|
# ---- Sidebar: display toggles ----
sidebar = st.sidebar
show_diff = sidebar.checkbox("Show Diff", value=False)
sort_by_size = sidebar.checkbox("Sort by size", value=False)
split_chat_models = sidebar.checkbox("Split chat models", value=False)
add_mean = sidebar.checkbox("Add mean", value=False)
show_std = sidebar.checkbox("Show std", value=False)

# ---- Sidebar: model size ranges (both sliders share the same settings) ----
size_slider_options = dict(min_value=0, max_value=100, value=(0, 100), step=1)
model_size_train = sidebar.slider("Train Model Size in Billion", **size_slider_options)
model_size_test = sidebar.slider("Test Model Size in Billion", **size_slider_options)

# ---- Sidebar: chat filters; "Both" (the default) disables the filter ----
chat_choices = [True, False, "Both"]
is_chat_train = sidebar.selectbox("(Train) Is Chat?", list(chat_choices), index=2)
is_chat_test = sidebar.selectbox("(Test) Is Chat?", list(chat_choices), index=2)

# ---- Sidebar: model families ----
model_family_train = sidebar.multiselect(
    "Model Family Train",
    MODEL_FAMILES,
    default=MODEL_FAMILES,
)
model_family_test = sidebar.multiselect(
    "Model Family Test",
    list(MODEL_FAMILES) + ["Adversarial"],
    default=MODEL_FAMILES,
)

# "Adversarial" is offered next to the families but is not one of them:
# take it out of the selection and remember that it was requested.
add_adversarial = "Adversarial" in model_family_test
if add_adversarial:
    model_family_test.remove("Adversarial")

# The sort toggle is only offered when the adversarial column is shown.
sort_by_adversarial = add_adversarial and sidebar.checkbox(
    "Sort by adversarial", value=False
)

color_scale = (
    "Viridis_r"
    if sidebar.checkbox("Use default color scale", value=False)
    else viridis_rgb
)

is_debug = sidebar.checkbox("Debug", value=False)
|
|
|
# Table to display: per-model standard deviations or per-model means.
selected_df = (df_std if show_std else df).copy()

if show_diff:
    # Set the metadata columns aside so only score columns are differenced.
    columns_to_keep = ["model_size", "model_family", "is_chat"]
    to_be_added = selected_df[columns_to_keep]
    selected_df = selected_df.drop(columns=columns_to_keep)
    # Subtract, from every column, the diagonal entry belonging to that column.
    selected_df = selected_df.sub(selected_df.values.diagonal(), axis=1)
    selected_df = selected_df.join(to_be_added)

filtered_df = filter_df(
    selected_df,
    model_family_train,
    model_family_test,
    model_size_train,
    model_size_test,
    is_chat_train,
    is_chat_test,
    sort_by_size,
    split_chat_models,
    is_debug,
)

if add_adversarial:
    # Join only the "Adversarial" column: the aggregated tables also carry the
    # averaged "seed" column, which is not a score. Use the std table when the
    # rest of the view shows standard deviations.
    adversarial_table = ood_results_std if show_std else ood_results_avg
    filtered_df = filtered_df.join(adversarial_table[["Adversarial"]])

# Bail out before the mean row/column are appended; once they are added the
# frame is never empty and the check could not fire.
if filtered_df.empty:
    st.write("No results found")
    st.stop()

if add_mean:
    col_mean = filtered_df.mean(axis=1)  # one value per row
    row_mean = filtered_df.mean(axis=0)  # one value per column
    diag = filtered_df.values.diagonal()
    filtered_df["mean"] = col_mean
    filtered_df.loc["mean"] = row_mean

# Display values scaled by 100 and rounded to whole numbers.
filtered_df = (filtered_df * 100).round(0)

if sort_by_adversarial:
    filtered_df = filtered_df.sort_values(by=["Adversarial"], ascending=False)
|
|
|
# Heat-map of the filtered table: columns on the x axis, rows on the y axis.
fig = px.imshow(
    filtered_df.values,
    x=list(filtered_df.columns),
    y=list(filtered_df.index),
    color_continuous_scale=color_scale,
    contrast_rescaling=None,
    text_auto=True,
    aspect="auto",
)

# Free-text layout/export settings.
width = st.sidebar.text_input("Width", "1920")
height = st.sidebar.text_input("Height", "1080")
scale = st.sidebar.text_input("Scale", "1.0")
margin = st.sidebar.text_input("Margin[l,r,b,t]", "200,100,100,100")

# Parse the margin once. The text is user input, so report a malformed value
# in the sidebar instead of failing with a traceback. Only the first four
# comma-separated fields are read.
try:
    margin_left, margin_right, margin_bottom, margin_top = [
        int(part) for part in margin.split(",")[:4]
    ]
except ValueError:
    st.sidebar.error("Margin must be four comma-separated integers: l,r,b,t")
    st.stop()

fig.update_traces(textfont_size=9)
fig.update_layout(
    xaxis={"side": "top"},
    yaxis={"side": "left"},
    margin=dict(l=margin_left, r=margin_right, b=margin_bottom, t=margin_top),
    font=dict(size=10),
)
fig.update_xaxes(tickangle=45, tickmode="linear")
fig.update_yaxes(tickmode="linear")

st.plotly_chart(fig, use_container_width=True)

if st.sidebar.button("save", key="save"):
    try:
        export_width = int(width)
        export_height = int(height)
        export_scale = float(scale)
    except ValueError:
        st.sidebar.error("Width and Height must be integers and Scale a number")
    else:
        fig.write_image(
            "fig1.pdf",
            width=export_width,
            height=export_height,
            validate=True,
            scale=export_scale,
        )
|
|
|
|
|
|
|
def _scatter_vs_model_size(values, model_names, **scatter_kwargs):
    """Scatter ``values`` against the size of the models in ``model_names``.

    ``values[i]`` belongs to ``model_names[i]``. Points are coloured by model
    family, the x axis is logarithmic and an OLS trendline is requested.
    Extra keyword arguments are forwarded to ``px.scatter``.
    """
    return px.scatter(
        y=values,
        x=[MODELS[name]["model_size"] for name in model_names],
        color=[MODELS[name]["model_family"] for name in model_names],
        color_discrete_sequence=px.colors.qualitative.Plotly,
        log_x=True,
        trendline="ols",
        **scatter_kwargs,
    )


if add_mean and not show_diff:
    # Take the model names from the mean series themselves so every value is
    # paired with its own model even if the displayed table was re-sorted, and
    # skip labels that are not registered models (e.g. "Adversarial").
    target_models = [name for name in row_mean.index if name in MODELS]
    train_models = [name for name in col_mean.index if name in MODELS]
    # Models present on both axes: the only ones with a self-detection score.
    self_models = [name for name in target_models if name in train_models]

    if any(name in CHAT_MODELS for name in target_models + train_models):
        st.warning(
            "Chat models are in the filtered df columns or index. "
            "This will cause the mean graph to be skewed."
        )

    panels = []
    if self_models:
        # Look the score up by name (row == column) rather than by position,
        # which is only valid when both axes list the same models.
        fig4 = _scatter_vs_model_size(
            [selected_df.at[name, name] for name in self_models],
            self_models,
            title="",
            labels={
                "x": "Target Model Size",
                "y": "Self ROC AUC",
                "color": "Model Family",
            },
        )
        panels.append(("Self Detection ROC AUC", fig4))
    if target_models:
        fig3 = _scatter_vs_model_size(
            row_mean.loc[target_models].to_numpy(),
            target_models,
            title="",
            labels={
                "x": "Target Model Size",
                "y": "Average ROC AUC",
                "color": "Model Family",
            },
        )
        panels.append(("Average Target ROC AUC", fig3))

    if panels:
        fig_subplot = make_subplots(
            rows=1,
            cols=len(panels),
            shared_yaxes=False,
            subplot_titles=[title for title, _ in panels],
        )
        for position, (_, figure) in enumerate(panels, start=1):
            for trace in figure.data:
                # Only the first panel contributes legend entries.
                if position > 1:
                    trace.showlegend = False
                fig_subplot.add_trace(trace, row=1, col=position)

        fig_subplot.update_xaxes(type="log")
        fig_subplot.update_yaxes(range=[0.90, 1])
        fig_subplot.update_layout(
            height=500,
            width=1200,
            legend=dict(orientation="h", yanchor="bottom", y=-0.2, x=0.09),
        )
        st.plotly_chart(fig_subplot, use_container_width=True)

    if train_models:
        fig2 = _scatter_vs_model_size(
            col_mean.loc[train_models].to_numpy(),
            train_models,
            title="Mean vs Train Model Size",
        )
        fig2.update_layout(height=600, width=900)
        st.plotly_chart(fig2, use_container_width=False)
|
|