# virus_explorer / app.py
# Hack90's picture
# Update app.py
# 3590429 verified
# raw
# history blame
# 10.2 kB
# NOTE(review): `ui`, `render` and `input` are used throughout this file but are
# not imported in this view; presumably they come from Shiny Express
# (e.g. `from shiny.express import input, render, ui`) - confirm and add.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d

from utils import (
    filter_and_select,
    plot_2d_comparison,
    plot_color_square,
    wens_method_heatmap,
    plot_fcgr,
    plot_persistence_homology,
)
############################################################# Virus Dataset ########################################################
# NOTE: the load_dataset('Hack90/virus_tiny') route is disabled; the data is
# read from a local parquet file instead.
df = pd.read_parquet('virus_ds.parquet')
# Choices for the virus selector: every distinct organism name mapped to itself.
virus = {name: name for name in df['Organism_Name'].unique()}
############################################################# Filter and Select ########################################################
def filter_and_select(group):
    """Return the first three rows of ``group`` if it has at least three.

    Args:
        group: Any sized object exposing ``head(n)`` (used below with the
            per-organism frames produced by ``groupby``).

    Returns:
        ``group.head(3)`` when ``len(group) >= 3``, otherwise ``None``.
    """
    if len(group) < 3:
        # Too few rows to contribute a full set of three.
        return None
    return group.head(3)
############################################################# UI #################################################################
# NOTE(review): leading indentation is missing in this view; the `with` blocks
# below are presumably nested in statement order and stay open around the
# render functions that follow - confirm against the original file.
# Page options, then the tab set (id "tab") that hosts the panels.
ui.page_opts(fillable=True)
with ui.navset_card_tab(id="tab"):
# First panel: choose viruses and a visualisation method.
with ui.nav_panel("Viral Macrostructure"):
ui.panel_title("Do viruses have underlying structure?")
with ui.layout_columns():
with ui.card():
# Multi-select over the `virus` mapping built above; nothing pre-selected.
ui.input_selectize("virus_selector", "Select your viruses:", virus, multiple=True, selected=None)
with ui.card():
# Single-select method name; plot_macro compares against these exact strings
# (including the "Persistant" spelling), so they must stay in sync.
ui.input_selectize(
"plot_type_macro",
"Select your method:",
["Chaos Game Representation", "2D Line", "ColorSquare", "Persistant Homology", "Wens Method"],
multiple=False,
selected=None,
)
@render.plot()
def plot_macro():
    """Render the macrostructure plot for the selected viruses and method.

    Reads the ``virus_selector`` and ``plot_type_macro`` inputs, restricts the
    parquet dataset to the chosen organisms and dispatches to the matching
    plotting helper.

    Returns:
        Whatever the selected plotting helper returns, or ``None`` when the
        selected method is not one of the known names.
    """
    df = pd.read_parquet("virus_ds.parquet")
    df = df[df["Organism_Name"].isin(input.virus_selector())]
    plot_type = input.plot_type_macro()

    if plot_type == "2D Line":
        # Only this method needs the per-organism sequence lists.
        grouped = df.groupby("Organism_Name")["Sequence"].apply(list)
        return plot_2d_comparison(grouped, grouped.index)
    if plot_type == "Wens Method":
        return wens_method_heatmap(df, df["Organism_Name"].unique())
    if plot_type not in ("ColorSquare", "Chaos Game Representation", "Persistant Homology"):
        return None

    # The remaining methods all work on the first three sequences of each
    # organism that has at least three (see filter_and_select).
    # NOTE(review): newer pandas versions deprecate groupby.apply operating on
    # the grouping column - confirm against the pinned pandas version.
    filtered_df = df.groupby("Organism_Name").apply(filter_and_select).reset_index(drop=True)
    sequences = filtered_df["Sequence"]
    organisms = filtered_df["Organism_Name"]

    if plot_type == "ColorSquare":
        return plot_color_square(sequences, organisms.unique())
    if plot_type == "Chaos Game Representation":
        # Labels come from the filtered frame so they describe exactly the
        # rows passed in: the unfiltered frame can still contain organisms
        # that were dropped, and lists them in file order rather than in the
        # group order of `sequences`.
        return plot_fcgr(sequences, organisms.unique())
    return plot_persistence_homology(sequences, organisms)
# Second panel: k-mer distribution controls.
# NOTE(review): indentation is missing in this view; nesting presumably follows
# statement order - confirm.
with ui.nav_panel("Viral Microstructure"):
ui.panel_title("Kmer Distribution")
with ui.layout_columns():
with ui.card():
# k-mer length slider: 0 to 10, starting at 4 (plot_micro only filters when k > 0).
ui.input_slider("kmer", "kmer", 0, 10, 4)
# How many rows of the k-mer table to show: 0 to 1000, starting at 15.
ui.input_slider("top_k", "top:", 0, 1000, 15)
# Metric name; plot_micro compares against these exact strings.
ui.input_selectize("plot_type", "Select metric:", ["percentage", "count"], multiple=False, selected=None)
@render.plot()
def plot_micro():
    """Render a bar chart of the first ``top_k`` k-mers of length ``k``.

    Reads the ``kmer``, ``top_k`` and ``plot_type`` inputs and the
    ``kmers.csv`` table (columns ``k``, ``kmer``, ``count``, ``percent``).

    Returns:
        A matplotlib figure. When ``k`` is 0 (the slider's minimum) the axes
        are returned titled and labelled but without bars, instead of
        failing or plotting the whole table.
    """
    k = input.kmer()
    top_k = input.top_k()
    plot_type = input.plot_type()

    fig, ax = plt.subplots()
    if k > 0:
        kmers = pd.read_csv("kmers.csv")
        kmers = kmers[kmers["k"] == k].head(top_k)
        if plot_type == "count":
            ax.bar(kmers["kmer"], kmers["count"])
            ax.set_ylabel("Count")
        elif plot_type == "percentage":
            ax.bar(kmers["kmer"], kmers["percent"] * 100)
            ax.set_ylabel("Percentage")
    ax.set_title(f"Most common {k}-mers")
    ax.set_xlabel("K-mer")
    # Rotate the existing tick labels rather than replacing them with
    # set_xticklabels, which requires fixed tick positions to be set first.
    ax.tick_params(axis="x", labelrotation=90)
    return fig
# Third panel: loss curves for different context sizes.
# NOTE(review): indentation is missing in this view; the definitions that
# follow presumably sit inside this panel - confirm.
with ui.nav_panel("Viral Model Training"):
ui.panel_title("Does context size matter for a nucleotide model?")
def plot_loss_rates(df, model_type):
    """Plot every loss column of ``df`` on a shared, normalised step axis.

    Each column other than ``Step`` is treated as one loss curve. Its
    non-missing values are linearly resampled onto 1000 evenly spaced points
    in [0, 1] so that curves of different lengths can be compared.

    Args:
        df: Frame with a ``Step`` column plus one column per curve.
        model_type: Text inserted into the plot title (e.g. ``"14M"``).

    Returns:
        The matplotlib figure holding the curves.
    """
    x = np.linspace(0, 1, 1000)
    # Labels are assigned by column position; columns beyond this list fall
    # back to their own name instead of raising IndexError.
    context_labels = ["32", "64", "128", "256", "512", "1024"]

    fig, ax = plt.subplots()
    for position, col in enumerate(df.drop(columns=["Step"]).columns):
        y = pd.to_numeric(df[col], errors="coerce").dropna().to_numpy(dtype=float)
        if y.size == 0:
            # Nothing to draw for an all-missing column.
            continue
        label = context_labels[position] if position < len(context_labels) else str(col)
        # np.interp is plain linear interpolation and, unlike interp1d, also
        # accepts a curve with a single sample.
        ax.plot(x, np.interp(x, np.linspace(0, 1, y.size), y), label=label)

    if ax.lines:
        # legend() warns when there is nothing labelled to show.
        ax.legend()
    ax.set_title(f"Loss rates for a {model_type} parameter model across context windows")
    ax.set_xlabel("Training steps")
    ax.set_ylabel("Loss rate")
    return fig
@render.image(delete_file=True)
def plot_context_size_scaling():
    """Render the context-size loss curves from ``14m.csv`` as an SVG image.

    Returns:
        A dict with ``src``, ``width`` and ``format`` entries pointing at a
        temporary SVG file, or ``None`` if no figure was produced.
        ``delete_file=True`` asks Shiny to remove the file once it has been
        sent, so temporary files do not pile up across renders.
    """
    import tempfile

    df = pd.read_csv("14m.csv")
    fig = plot_loss_rates(df, "14M")
    if fig is None:
        return None

    # Reserve a path and close the handle immediately; mkstemp would leave an
    # open file descriptor behind on every render.
    with tempfile.NamedTemporaryFile(suffix=".svg", delete=False) as tmp:
        path = tmp.name
    try:
        fig.savefig(path)
    finally:
        # pyplot keeps a reference to every figure it creates until closed.
        plt.close(fig)
    return {"src": str(path), "width": "600px", "format": "svg"}
# Fourth panel: compare loss curves across parameter counts, model types and
# loss types.
# NOTE(review): indentation is missing in this view; nesting presumably follows
# statement order - confirm.
with ui.nav_panel("Model loss analysis"):
ui.panel_title("Neurips stuff")
with ui.card():
# Parameter sizes as strings; plot_loss_rates_model converts them with int()
# before comparing to the "param_type" column.
ui.input_selectize(
"param_type",
"Select Param Type:",
["14", "31", "70", "160", "410"],
multiple=True,
selected=["14", "70"],
)
# Compared verbatim against the "model_type" column.
ui.input_selectize(
"model_type",
"Select Model Type:",
["pythia", "denseformer", "evo"],
multiple=True,
selected=["pythia", "denseformer"],
)
# Compared verbatim against the "loss_type" column, so the spellings here
# (e.g. "compliment") must match the data file.
ui.input_selectize(
"loss_type",
"Select Loss Type:",
["compliment", "cross_entropy", "headless", "2d", "2d_representation_MSEPlusCE"],
multiple=True,
selected=["compliment", "cross_entropy", "headless"],
)
def plot_loss_rates_model(df, param_types, loss_types, model_types):
    """Plot one loss curve per (param type, loss type, model type) combination.

    For every combination that has rows in ``df``, the ``loss_interp`` values
    are linearly resampled onto 1000 evenly spaced points in [0, 1] and drawn
    with the label ``"{param_type}_{loss_type}_{model_type}"``.

    Args:
        df: Frame with ``param_type``, ``loss_type``, ``model_type`` and
            ``loss_interp`` columns.
        param_types: Parameter sizes; each is converted with ``int()`` before
            being compared to the ``param_type`` column.
        loss_types: Loss-type names to include.
        model_types: Model-type names to include.

    Returns:
        The matplotlib figure; its axes are empty when nothing matches.
    """
    x = np.linspace(0, 1, 1000)
    fig, ax = plt.subplots()

    for param_type in param_types:
        # Narrow the frame one level at a time instead of re-evaluating the
        # full three-way mask for every combination.
        by_param = df[df["param_type"] == int(param_type)]
        for loss_type in loss_types:
            by_loss = by_param[by_param["loss_type"] == loss_type]
            for model_type in model_types:
                y = by_loss.loc[by_loss["model_type"] == model_type, "loss_interp"].to_numpy()
                if y.size == 0:
                    continue
                # np.interp is plain linear interpolation and, unlike
                # interp1d, also accepts a series with a single sample.
                curve = np.interp(x, np.linspace(0, 1, y.size), y)
                ax.plot(x, curve, label=f"{param_type}_{loss_type}_{model_type}")

    if ax.lines:
        # legend() warns when there is nothing labelled to show.
        ax.legend()
    ax.set_xlabel("Training steps")
    ax.set_ylabel("Loss rate")
    return fig
@render.image(delete_file=True)
def plot_model_scaling():
    """Render the selected model/loss/parameter loss curves as an SVG image.

    Loads ``training_data_5.csv``, keeps rows with ``epoch_interp`` above
    0.035 and plots the combinations chosen in the ``param_type``,
    ``loss_type`` and ``model_type`` inputs.

    Returns:
        A dict with ``src``, ``width`` and ``format`` entries pointing at a
        temporary SVG file, or ``None`` if no figure was produced.
        ``delete_file=True`` asks Shiny to remove the file once it has been
        sent, so temporary files do not pile up across renders.
    """
    import tempfile

    df = pd.read_csv("training_data_5.csv")
    df = df[df["epoch_interp"] > 0.035]
    fig = plot_loss_rates_model(
        df, input.param_type(), input.loss_type(), input.model_type()
    )
    if fig is None:
        return None

    # Reserve a path and close the handle immediately; mkstemp would leave an
    # open file descriptor behind on every render.
    with tempfile.NamedTemporaryFile(suffix=".svg", delete=False) as tmp:
        path = tmp.name
    try:
        fig.savefig(path)
    finally:
        # pyplot keeps a reference to every figure it creates until closed.
        plt.close(fig)
    return {"src": str(path), "width": "600px", "format": "svg"}
# Fifth panel: best loss versus parameter count.
# NOTE(review): indentation is missing in this view; nesting presumably follows
# statement order - confirm.
with ui.nav_panel("Scaling Laws"):
ui.panel_title("Params & Losses")
with ui.card():
# Compared verbatim against the "model_type" column.
ui.input_selectize(
"model_type_scale",
"Select Model Type:",
["pythia", "denseformer", "evo"],
multiple=True,
selected=["evo", "denseformer"],
)
# Multi-select, but plot_loss_rates_model_scale only reads the first entry.
ui.input_selectize(
"loss_type_scale",
"Select Loss Type:",
["compliment", "cross_entropy", "headless", "2d", "2d_representation_MSEPlusCE"],
multiple=True,
selected=["cross_entropy"],
)
def plot_loss_rates_model_scale(df, loss_type, model_types):
    """Plot the best (minimum) loss against parameter count per model type.

    Only the first entry of ``loss_type`` is used. For each model type the
    minimum ``loss_interp`` per distinct ``num_params`` value is drawn in
    ascending parameter order.

    Args:
        df: Frame with ``loss_type``, ``model_type``, ``num_params`` and
            ``loss_interp`` columns.
        loss_type: Sequence of selected loss-type names; may be empty.
        model_types: Model-type names to draw, one line each.

    Returns:
        The matplotlib figure. The axes are empty when ``loss_type`` is empty
        or nothing matches; model types without rows get no line.
    """
    fig, ax = plt.subplots()

    # An empty multi-select must not crash on loss_type[0].
    if len(loss_type) > 0:
        rows = df[df["loss_type"] == loss_type[0]]
        for model_type in model_types:
            # groupby sorts by num_params and skips missing keys, replacing
            # the manual unique()/min()/sort loop.
            best = (
                rows[rows["model_type"] == model_type]
                .groupby("num_params")["loss_interp"]
                .min()
            )
            if best.empty:
                continue
            params = [int(p) for p in best.index]
            ax.plot(params, best.to_list(), label=model_type)

    if ax.lines:
        # legend() warns when there is nothing labelled to show.
        ax.legend()
    ax.set_xlabel("Params")
    ax.set_ylabel("Loss")
    return fig
@render.image(delete_file=True)
def plot_big_boy_model():
    """Render the loss-versus-parameters plot as an SVG image.

    Loads ``training_data_5.csv`` and plots it for the selections in the
    ``loss_type_scale`` and ``model_type_scale`` inputs.

    Returns:
        A dict with ``src``, ``width`` and ``format`` entries pointing at a
        temporary SVG file, or ``None`` if no figure was produced.
        ``delete_file=True`` asks Shiny to remove the file once it has been
        sent, so temporary files do not pile up across renders.
    """
    import tempfile

    df = pd.read_csv("training_data_5.csv")
    fig = plot_loss_rates_model_scale(
        df, input.loss_type_scale(), input.model_type_scale()
    )
    if fig is None:
        return None

    # Reserve a path and close the handle immediately; mkstemp would leave an
    # open file descriptor behind on every render.
    with tempfile.NamedTemporaryFile(suffix=".svg", delete=False) as tmp:
        path = tmp.name
    try:
        fig.savefig(path)
    finally:
        # pyplot keeps a reference to every figure it creates until closed.
        plt.close(fig)
    return {"src": str(path), "width": "600px", "format": "svg"}