MidAtBest committed
Commit e9035bc · 1 Parent(s): 9fc7bf0

feat: add multiple example plots

Files changed (3)
  1. README.md +1 -8
  2. data/bigwig_dataset.csv +0 -0
  3. src/streamlit_app.py +538 -6
README.md CHANGED
@@ -10,11 +10,4 @@ tags:
pinned: false
short_description: NTv3 Benchmark
license: apache-2.0
- ---
-
- # Welcome to Streamlit!
-
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
+ ---

data/bigwig_dataset.csv CHANGED
The diff for this file is too large to render. See raw diff
 
src/streamlit_app.py CHANGED
@@ -4,6 +4,8 @@ import os
import pandas as pd
import streamlit as st
import plotly.express as px

# ---------------------------------------------------------------------
# Page config (must be the first Streamlit command)
@@ -68,6 +70,15 @@ MODEL_COLORS = {

MODEL_NAMES = list(MODEL_COLORS.keys())

_LAST_UPDATED = "Dec 10, 2025"
_INTRO = """
Benchmark across gene annotation and functional tracks.
@@ -181,12 +192,12 @@ _ALL_ASSAYS = (
_ALL_MODELS = MODEL_NAMES[:]

_BENCHMARKS = {
- "Pearson correlations (multi-assay)": {
"df": _PEARSON_DF,
"metric_label": "Pearson correlation",
"has_assay_type": True,
},
- "MCC (bed tracks)": {
"df": _MCC_DF,
"metric_label": "MCC",
"has_assay_type": False,
@@ -260,6 +271,10 @@ def build_leaderboard(

agg = agg.sort_values("Mean score", ascending=False).reset_index(drop=True)
agg = agg[["Model", "Num entries", "Mean score"]]
return agg

@@ -276,6 +291,303 @@ def build_bar_df(
)

# ---------------------------------------------------------------------
# UI helpers
# ---------------------------------------------------------------------
@@ -318,12 +630,30 @@ def main():

# Species toggles, but only for species present in this benchmark
st.sidebar.subheader("Species")
- available_species = sorted(df_bench["species"].unique())
selected_species: List[str] = []
for sp in available_species:
- if sidebar_toggle(sp, value=True, key=f"species_{benchmark_name}_{sp}"):
selected_species.append(sp)

# Assay toggles (Pearson only), based on filtered species
if cfg.get("has_assay_type", False):
st.sidebar.subheader("Assay types")
@@ -343,9 +673,10 @@
else:
selected_assays = []

# Bed track / dataset toggles (MCC only), based on species selection
selected_datasets: List[str] = []
- if benchmark_name == "MCC":
st.sidebar.subheader("Genome annotations")
if selected_species:
df_for_tracks = df_bench[df_bench["species"].isin(selected_species)]
@@ -377,33 +708,234 @@

with col1:
st.subheader("🏅 Leaderboard (per model)")
if leaderboard_df.empty:
st.info("No data for the selected filters.")
else:
st.dataframe(leaderboard_df, use_container_width=True)

with col2:
st.subheader("📈 Mean score per model")
if bar_df.empty:
st.info("No data for the selected filters.")
else:
fig = px.bar(
bar_df,
x="Model",
y="Mean score",
color="Model",
color_discrete_map=MODEL_COLORS,
)
fig.update_layout(
barmode="group",
height=500,
- xaxis_title="Model",
yaxis_title="Mean score",
plot_bgcolor="rgba(0,0,0,0)",
paper_bgcolor="rgba(0,0,0,0)",
)
st.plotly_chart(fig, use_container_width=True)

if __name__ == "__main__":
main()
 
4
  import pandas as pd
5
  import streamlit as st
6
  import plotly.express as px
7
+ from plotly.subplots import make_subplots
8
+ import plotly.graph_objects as go
9
 
10
  # ---------------------------------------------------------------------
11
  # Page config (must be the first Streamlit command)
 
70
 
71
  MODEL_NAMES = list(MODEL_COLORS.keys())
72
 
73
+ PLANT_SPECIES = ["tomato", "rice", "maize", "arabidopsis"]
74
+ ANIMAL_SPECIES = ["human", "chicken", "cattle"]
75
+
76
+ SPECIES_GROUPS = {
77
+ "Plants": PLANT_SPECIES,
78
+ "Animals": ANIMAL_SPECIES, # (your code calls these HUMAN_SPECIES, but theyโ€™re the โ€œanimalโ€ set)
79
+ }
80
+
81
+
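As an illustrative aside (not part of the commit): a minimal sketch of how these groups are meant to intersect with the species actually present in a benchmark dataframe, mirroring the sidebar logic added further down; the species list here is made up.
available_species_all = ["arabidopsis", "cattle", "human", "maize"]  # toy stand-in for df_bench["species"]
allowed_species = set(SPECIES_GROUPS["Plants"]).intersection(available_species_all)
sorted(allowed_species)  # -> ['arabidopsis', 'maize']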
82
  _LAST_UPDATED = "Dec 10, 2025"
83
  _INTRO = """
84
Benchmark across gene annotation and functional tracks.
 
192
  _ALL_MODELS = MODEL_NAMES[:]
193
 
194
  _BENCHMARKS = {
195
+ "Functional Tracks": {
196
  "df": _PEARSON_DF,
197
  "metric_label": "Pearson correlation",
198
  "has_assay_type": True,
199
  },
200
+ "Genome Annotation": {
201
  "df": _MCC_DF,
202
  "metric_label": "MCC",
203
  "has_assay_type": False,
 
271
 
272
  agg = agg.sort_values("Mean score", ascending=False).reset_index(drop=True)
273
  agg = agg[["Model", "Num entries", "Mean score"]]
274
+
275
+ # Ensure the index starts at 1
276
+ agg.index += 1
277
+
278
  return agg
279
 
280
 
 
291
  )
292
 
293
 
294
+ def build_category_model_df(
295
+ benchmark_name: str,
296
+ selected_species: List[str],
297
+ selected_assays: List[str],
298
+ selected_models: List[str],
299
+ selected_datasets: List[str],
300
+ ) -> pd.DataFrame:
301
+ """
302
+ Mean score per (category, Model) after applying the same filters.
303
+ Category = assay_type (Functional Tracks) or datasets (Genome Annotation).
304
+ """
305
+ cfg = _BENCHMARKS[benchmark_name]
306
+ df = filter_base_df(
307
+ benchmark_name,
308
+ selected_species,
309
+ selected_assays,
310
+ selected_models,
311
+ selected_datasets,
312
+ )
313
+ if df.empty:
314
+ return pd.DataFrame(columns=["Category", "Model", "Mean score"])
315
+
316
+ # Pick the right breakdown column
317
+ if cfg.get("has_assay_type", False) and "assay_type" in df.columns:
318
+ category_col = "assay_type"
319
+ category_label = "Assay type"
320
+ else:
321
+ category_col = "datasets"
322
+ category_label = "Dataset"
323
+
324
+ if category_col not in df.columns:
325
+ return pd.DataFrame(columns=["Category", "Model", "Mean score"])
326
+
327
+ out = (
328
+ df.groupby([category_col, "Model"], as_index=False)["Score"]
329
+ .mean()
330
+ .rename(columns={category_col: "Category", "Score": "Mean score"})
331
+ )
332
+ out["Mean score"] = out["Mean score"].round(3)
333
+ out.attrs["category_label"] = category_label # for nicer axis title
334
+ return out
335
+
336
+
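As an illustrative aside (not part of the commit): with toy scores, made-up assay names, and placeholder model names, the groupby above collapses per-track rows into one mean per (Category, Model) pair:
import pandas as pd

toy = pd.DataFrame({
    "assay_type": ["ATAC", "ATAC", "ATAC", "CAGE"],
    "Model": ["Model X", "Model X", "Model Y", "Model X"],
    "Score": [0.60, 0.70, 0.50, 0.40],
})
out = (
    toy.groupby(["assay_type", "Model"], as_index=False)["Score"]
    .mean()
    .rename(columns={"assay_type": "Category", "Score": "Mean score"})
)
# Category    Model  Mean score
#     ATAC  Model X        0.65
#     ATAC  Model Y        0.50
#     CAGE  Model X        0.40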
337
+ def plot_breakdown_facets_sorted_models(
338
+ breakdown_df: pd.DataFrame,
339
+ metric_label: str,
340
+ height: int = 420,
341
+ ):
342
+ categories = list(breakdown_df["Category"].dropna().unique())
343
+ categories = sorted(categories)
344
+
345
+ n = len(categories)
346
+ if n == 0:
347
+ return None
348
+
349
+ rows = 1
350
+ cols = n  # 👈 everything in one row
351
+
352
+ # Global y-range (consistent scale)
353
+ y_min = breakdown_df["Mean score"].min()
354
+ y_max = breakdown_df["Mean score"].max()
355
+ pad = 0.05 * (y_max - y_min if y_max > y_min else 1.0)
356
+ y_range = [y_min - pad, y_max + pad]
357
+
358
+ fig = make_subplots(
359
+ rows=rows,
360
+ cols=cols,
361
+ subplot_titles=categories,
362
+ shared_yaxes=True,
363
+ horizontal_spacing=0.04, # tighter spacing
364
+ )
365
+
366
+ for i, cat in enumerate(categories):
367
+ r = (i // cols) + 1
368
+ c = (i % cols) + 1
369
+
370
+ sub = (
371
+ breakdown_df[breakdown_df["Category"] == cat]
372
+ .sort_values("Mean score", ascending=True)
373
+ )
374
+
375
+ fig.add_trace(
376
+ go.Bar(
377
+ x=sub["Model"],
378
+ y=sub["Mean score"],
379
+ marker_color=[MODEL_COLORS.get(m, "#808080") for m in sub["Model"]],
380
+ showlegend=False,
381
+ ),
382
+ row=r,
383
+ col=c,
384
+ )
385
+
386
+ fig.update_xaxes(showticklabels=False, title_text="", row=r, col=c)
387
+ fig.update_yaxes(range=y_range, title_text="", row=r, col=c)  # 👈 apply range
388
+
389
+ fig.update_layout(
390
+ height=height,
391
+ plot_bgcolor="rgba(0,0,0,0)",
392
+ paper_bgcolor="rgba(0,0,0,0)",
393
+ margin=dict(t=60, l=10, r=10, b=10),
394
+ )
395
+
396
+ # Single y-axis label on the leftmost panel
397
+ fig.update_yaxes(title_text=metric_label, row=1, col=1)
398
+
399
+ return fig
400
+
401
+
402
+ def build_radar_df(
403
+ benchmark_name: str,
404
+ selected_species: List[str],
405
+ selected_assays: List[str],
406
+ selected_models: List[str],
407
+ selected_datasets: List[str],
408
+ ) -> pd.DataFrame:
409
+ cfg = _BENCHMARKS[benchmark_name]
410
+
411
+ df = filter_base_df(
412
+ benchmark_name,
413
+ selected_species,
414
+ selected_assays,
415
+ selected_models,
416
+ selected_datasets,
417
+ )
418
+
419
+ if df.empty:
420
+ return pd.DataFrame()
421
+
422
+ # Choose axis column
423
+ if cfg.get("has_assay_type", False) and "assay_type" in df.columns:
424
+ axis_col = "assay_type"
425
+ axis_label = "Assay type"
426
+ else:
427
+ axis_col = "datasets"
428
+ axis_label = "Dataset"
429
+
430
+ radar_df = (
431
+ df.groupby([axis_col, "Model"], as_index=False)["Score"]
432
+ .mean()
433
+ .rename(columns={axis_col: "Axis", "Score": "Value"})
434
+ )
435
+
436
+ radar_df.attrs["axis_label"] = axis_label
437
+ return radar_df
438
+
439
+
440
+ def build_pairwise_scatter_df(
441
+ benchmark_name: str,
442
+ selected_species: List[str],
443
+ selected_assays: List[str],
444
+ selected_models: List[str],
445
+ selected_datasets: List[str],
446
+ model_a: str,
447
+ model_b: str,
448
+ ) -> pd.DataFrame:
449
+ """
450
+ Returns a per-track dataframe with columns:
451
+ Track, Model A, Model B, (optional) species, (optional) assay_type, datasets
452
+ Where each row corresponds to a specific track (datasets [+ assay_type]).
453
+ """
454
+ cfg = _BENCHMARKS[benchmark_name]
455
+
456
+ # Filter using the same UI toggles, but ensure the chosen models are included
457
+ models_for_filter = list(set(selected_models + [model_a, model_b])) if selected_models else [model_a, model_b]
458
+
459
+ df = filter_base_df(
460
+ benchmark_name,
461
+ selected_species,
462
+ selected_assays,
463
+ models_for_filter,
464
+ selected_datasets,
465
+ )
466
+
467
+ if df.empty:
468
+ return pd.DataFrame()
469
+
470
+ # Define what "a specific track" means
471
+ track_cols = ["datasets"]
472
+ if cfg.get("has_assay_type", False) and "assay_type" in df.columns:
473
+ track_cols = ["assay_type", "datasets"]
474
+
475
+ # (Optional) keep species in hover if multiple are selected
476
+ keep_species = "species" in df.columns and (selected_species is None or len(selected_species) != 1)
477
+
478
+ id_cols = (["species"] if keep_species else []) + track_cols
479
+
480
+ # Pivot into two model columns
481
+ wide = (
482
+ df[df["Model"].isin([model_a, model_b])]
483
+ .pivot_table(index=id_cols, columns="Model", values="Score", aggfunc="mean")
484
+ .reset_index()
485
+ )
486
+
487
+ # Require both values to exist for a dot
488
+ if model_a not in wide.columns or model_b not in wide.columns:
489
+ return pd.DataFrame()
490
+
491
+ wide = wide.dropna(subset=[model_a, model_b])
492
+
493
+ # Nice "Track" label for display
494
+ if "assay_type" in wide.columns:
495
+ wide["Track"] = wide["assay_type"].astype(str) + " / " + wide["datasets"].astype(str)
496
+ else:
497
+ wide["Track"] = wide["datasets"].astype(str)
498
+
499
+ # Rename for plotting
500
+ wide = wide.rename(columns={model_a: "Model A", model_b: "Model B"})
501
+
502
+ return wide
503
+
504
+
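As an illustrative aside (not part of the commit): the pivot step above turns the long per-track score table into one row per track with one column per chosen model; values, track names, and model names below are placeholders.
import pandas as pd

long_df = pd.DataFrame({
    "datasets": ["track_1", "track_1", "track_2", "track_2"],
    "Model": ["Model X", "Model Y", "Model X", "Model Y"],
    "Score": [0.82, 0.79, 0.55, 0.61],
})
wide = (
    long_df.pivot_table(index=["datasets"], columns="Model", values="Score", aggfunc="mean")
    .reset_index()
)
#   datasets  Model X  Model Y
#    track_1     0.82     0.79
#    track_2     0.55     0.61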
505
+ def build_violin_df(
506
+ benchmark_name: str,
507
+ selected_species: List[str],
508
+ selected_assays: List[str],
509
+ selected_models: List[str],
510
+ selected_datasets: List[str],
511
+ ) -> pd.DataFrame:
512
+ # Use the same base filtering, but keep all per-track rows
513
+ df = filter_base_df(
514
+ benchmark_name,
515
+ selected_species,
516
+ selected_assays,
517
+ selected_models,
518
+ selected_datasets,
519
+ )
520
+ # Keep only needed columns
521
+ keep = ["Model", "Score"]
522
+ for col in ["species", "assay_type", "datasets"]:
523
+ if col in df.columns:
524
+ keep.append(col)
525
+ return df[keep].copy()
526
+
527
+
528
+ def plot_radar(
529
+ radar_df: pd.DataFrame,
530
+ metric_label: str,
531
+ height: int = 600,
532
+ ):
533
+ if radar_df.empty:
534
+ return None
535
+
536
+ axes = radar_df["Axis"].unique().tolist()
537
+
538
+ # Global radial range
539
+ r_min = radar_df["Value"].min()
540
+ r_max = radar_df["Value"].max()
541
+ pad = 0.05 * (r_max - r_min if r_max > r_min else 1.0)
542
+ r_range = [r_min - pad, r_max + pad]
543
+
544
+ fig = go.Figure()
545
+
546
+ for model in radar_df["Model"].unique():
547
+ sub = radar_df[radar_df["Model"] == model]
548
+
549
+ # Ensure consistent axis ordering
550
+ sub = sub.set_index("Axis").reindex(axes)
551
+
552
+ fig.add_trace(
553
+ go.Scatterpolar(
554
+ r=sub["Value"],
555
+ theta=axes,
556
+ fill="toself",
557
+ name=model,
558
+ line_color=MODEL_COLORS.get(model),
559
+ opacity=0.75,
560
+ )
561
+ )
562
+
563
+ fig.update_layout(
564
+ height=height,
565
+ polar=dict(
566
+ bgcolor="rgba(0,0,0,0)",  # 👈 polar background
567
+ radialaxis=dict(
568
+ title=metric_label,
569
+ range=r_range,
570
+ tickformat=".2f",
571
+ showgrid=True,
572
+ gridcolor="rgba(0,0,0,0.15)", # subtle grid
573
+ ),
574
+ angularaxis=dict(
575
+ showgrid=True,
576
+ gridcolor="rgba(0,0,0,0.15)",
577
+ ),
578
+ ),
579
+ paper_bgcolor="rgba(0,0,0,0)",  # 👈 entire figure background
580
+ plot_bgcolor="rgba(0,0,0,0)",  # 👈 plot area
581
+ showlegend=True,
582
+ legend_title_text="Model",
583
+ margin=dict(t=40, b=40, l=40, r=40),
584
+ )
585
+
586
+
587
+ return fig
588
+
589
+
590
+
591
  # ---------------------------------------------------------------------
592
  # UI helpers
593
  # ---------------------------------------------------------------------
 
630
 
631
  # Species toggles, but only for species present in this benchmark
632
  st.sidebar.subheader("Species")
633
+
634
+ # Toggle: Plants vs Animals
635
+ species_group = st.sidebar.radio(
636
+ "Group",
637
+ options=["Animals", "Plants"],
638
+ index=0,
639
+ horizontal=True,
640
+ key=f"species_group_{benchmark_name}",
641
+ )
642
+
643
+ available_species_all = sorted(df_bench["species"].unique())
644
+ allowed_species = set(SPECIES_GROUPS[species_group]).intersection(available_species_all)
645
+ available_species = sorted(allowed_species)
646
+
647
  selected_species: List[str] = []
648
  for sp in available_species:
649
+ if sidebar_toggle(sp, value=True, key=f"species_{benchmark_name}_{species_group}_{sp}"):
650
  selected_species.append(sp)
651
 
652
+ # (Optional) If no species exist for that group in this benchmark
653
+ if not available_species:
654
+ st.sidebar.info(f"No {species_group.lower()} species available for this benchmark.")
655
+
656
+
657
  # Assay toggles (Pearson only), based on filtered species
658
  if cfg.get("has_assay_type", False):
659
  st.sidebar.subheader("Assay types")
 
673
  else:
674
  selected_assays = []
675
 
676
+
677
  # Bed track / dataset toggles (MCC only), based on species selection
678
  selected_datasets: List[str] = []
679
+ if benchmark_name == "Genome Annotation":
680
  st.sidebar.subheader("Genome annotations")
681
  if selected_species:
682
  df_for_tracks = df_bench[df_bench["species"].isin(selected_species)]
 
708
 
709
  with col1:
710
st.subheader("🏅 Leaderboard (per model)")
711
+ st.write("\n")  # 👈 spacer to match plotly padding
712
+ st.write("\n")
713
+ st.write("\n")
714
  if leaderboard_df.empty:
715
  st.info("No data for the selected filters.")
716
  else:
717
  st.dataframe(leaderboard_df, use_container_width=True)
718
 
719
+
720
  with col2:
721
st.subheader("📈 Mean score per model")
722
  if bar_df.empty:
723
  st.info("No data for the selected filters.")
724
  else:
725
+ # Order models by performance (least -> most)
726
+ bar_df = bar_df.sort_values("Mean score", ascending=True)
727
+
728
+ model_order = bar_df["Model"].tolist()
729
+
730
  fig = px.bar(
731
  bar_df,
732
  x="Model",
733
  y="Mean score",
734
  color="Model",
735
  color_discrete_map=MODEL_COLORS,
736
+ category_orders={"Model": model_order}, # enforce ordering on x
737
  )
738
  fig.update_layout(
739
  barmode="group",
740
  height=500,
741
+ xaxis_title="",
742
  yaxis_title="Mean score",
743
  plot_bgcolor="rgba(0,0,0,0)",
744
  paper_bgcolor="rgba(0,0,0,0)",
745
+ bargap=0.08,
746
  )
747
+
748
+ # Hide x-axis model names (same style as the panels)
749
+ fig.update_xaxes(showticklabels=False)
750
+
751
  st.plotly_chart(fig, use_container_width=True)
752
 
753
 
754
+ # --- Breakdown plot: assay_type (Functional Tracks) OR datasets (Genome Annotation) ---
755
+ breakdown_df = build_category_model_df(
756
+ benchmark_name, selected_species, selected_assays, selected_models, selected_datasets
757
+ )
758
+
759
+ st.subheader("🧪 Mean score by assay type / dataset (all models)")
760
+ if breakdown_df.empty:
761
+ st.info("No data for the selected filters.")
762
+ else:
763
+ fig_breakdown = plot_breakdown_facets_sorted_models(
764
+ breakdown_df,
765
+ metric_label=cfg["metric_label"],
766
+ height=300,
767
+ )
768
+ st.plotly_chart(fig_breakdown, use_container_width=True)
769
+
770
+ st.subheader("🕸️ Performance by assay type / dataset (radar)")
771
+ radar_df = build_radar_df(
772
+ benchmark_name,
773
+ selected_species,
774
+ selected_assays,
775
+ selected_models,
776
+ selected_datasets,
777
+ )
778
+
779
+ if radar_df.empty:
780
+ st.info("No data for the selected filters.")
781
+ else:
782
+ fig_radar = plot_radar(
783
+ radar_df,
784
+ metric_label=cfg["metric_label"],
785
+ )
786
+ st.plotly_chart(fig_radar, use_container_width=True)
787
+
788
+ st.subheader("⚖️ Model comparison")
789
+
790
+ left, right = st.columns([1, 1], gap="large")
791
+
792
+ with left:
793
+ st.markdown("#### Head-to-head (per track)")
794
+
795
+ model_picker_options = selected_models if selected_models else _ALL_MODELS
796
+ default_a = model_picker_options[0] if model_picker_options else _ALL_MODELS[0]
797
+ default_b = model_picker_options[1] if len(model_picker_options) > 1 else (
798
+ _ALL_MODELS[1] if len(_ALL_MODELS) > 1 else default_a
799
+ )
800
+
801
+ cA, cB = st.columns([1, 1])
802
+ with cA:
803
+ model_a = st.selectbox(
804
+ "Model A (y-axis)",
805
+ options=model_picker_options,
806
+ index=model_picker_options.index(default_a) if default_a in model_picker_options else 0,
807
+ key=f"pair_model_a_{benchmark_name}",
808
+ )
809
+ with cB:
810
+ b_options = [m for m in model_picker_options if m != model_a] or model_picker_options
811
+ model_b = st.selectbox(
812
+ "Model B (x-axis)",
813
+ options=b_options,
814
+ index=0,
815
+ key=f"pair_model_b_{benchmark_name}",
816
+ )
817
+
818
+ scatter_df = build_pairwise_scatter_df(
819
+ benchmark_name,
820
+ selected_species,
821
+ selected_assays,
822
+ selected_models,
823
+ selected_datasets,
824
+ model_a,
825
+ model_b,
826
+ )
827
+
828
+ if scatter_df.empty:
829
+ st.info("No overlapping tracks for the selected filters (or one model is missing values).")
830
+ else:
831
+ min_v = float(min(scatter_df["Model A"].min(), scatter_df["Model B"].min()))
832
+ max_v = float(max(scatter_df["Model A"].max(), scatter_df["Model B"].max()))
833
+ pad = 0.05 * (max_v - min_v if max_v > min_v else 1.0)
834
+ axis_range = [min_v - pad, max_v + pad]
835
+ tick_step = (axis_range[1] - axis_range[0]) / 5
836
+
837
+ hover_cols = ["Track"]
838
+ for c in ["species", "assay_type", "datasets"]:
839
+ if c in scatter_df.columns:
840
+ hover_cols.append(c)
841
+
842
+ # Model A on Y, Model B on X
843
+ fig_scatter = px.scatter(
844
+ scatter_df,
845
+ x="Model B",
846
+ y="Model A",
847
+ hover_name="Track",
848
+ hover_data=hover_cols,
849
+ )
850
+
851
+ # Red diagonal y=x
852
+ fig_scatter.add_shape(
853
+ type="line",
854
+ x0=axis_range[0], y0=axis_range[0],
855
+ x1=axis_range[1], y1=axis_range[1],
856
+ xref="x", yref="y",
857
+ line=dict(color="red", dash="dot", width=2),
858
+ )
859
+
860
+ # Square + identical scale/ticks (works even with use_container_width=True)
861
+ fig_scatter.update_layout(
862
+ height=550,
863
+ margin=dict(l=60, r=20, t=20, b=60),
864
+ xaxis=dict(
865
+ title=f"{model_b} โ€” {cfg['metric_label']}",
866
+ range=axis_range,
867
+ dtick=tick_step,
868
+ constrain="domain",
869
+ ),
870
+ yaxis=dict(
871
+ title=f"{model_a} โ€” {cfg['metric_label']}",
872
+ range=axis_range,
873
+ dtick=tick_step,
874
+ scaleanchor="x", # lock y to x
875
+ scaleratio=1,
876
+ constrain="domain",
877
+ ),
878
+ plot_bgcolor="rgba(0,0,0,0)",
879
+ paper_bgcolor="rgba(0,0,0,0)",
880
+ )
881
+
882
+ st.plotly_chart(fig_scatter, use_container_width=True)
883
+
884
+ with right:
885
+ st.markdown("#### All models (distribution across tracks)")
886
+
887
+ violin_df = build_violin_df(
888
+ benchmark_name,
889
+ selected_species,
890
+ selected_assays,
891
+ selected_models,
892
+ selected_datasets,
893
+ )
894
+
895
+ if violin_df.empty:
896
+ st.info("No data for the selected filters.")
897
+ else:
898
+ # Order models by median performance (least -> most)
899
+ model_order = (
900
+ violin_df
901
+ .groupby("Model")["Score"]
902
+ .median()
903
+ .sort_values(ascending=True)
904
+ .index
905
+ .tolist()
906
+ )
907
+
908
+ fig_violin = px.violin(
909
+ violin_df,
910
+ x="Model",
911
+ y="Score",
912
+ color="Model",
913
+ color_discrete_map=MODEL_COLORS,
914
+ box=True, # keep inner boxplot
915
+ points=False,  # 👈 remove all dots
916
+ category_orders={"Model": model_order},  # 👈 enforce ordering
917
+ )
918
+
919
+ fig_violin.update_layout(
920
+ height=650,
921
+ xaxis_title="",
922
+ yaxis_title=cfg["metric_label"],
923
+ plot_bgcolor="rgba(0,0,0,0)",
924
+ paper_bgcolor="rgba(0,0,0,0)",
925
+ showlegend=False,
926
+ )
927
+
928
+ fig_violin.update_traces(
929
+ box_visible=True,
930
+ meanline_visible=False,
931
+ )
932
+
933
+ # Optional: hide model names if you prefer a cleaner look
934
+ # fig_violin.update_xaxes(showticklabels=False)
935
+
936
+ st.plotly_chart(fig_violin, use_container_width=True)
937
+
938
+
939
+
940
  if __name__ == "__main__":
941
  main()
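As an illustrative aside (not part of the commit): a quick way to eyeball the new figure builders outside the Streamlit UI is to call them from a session where the module's helpers and MODEL_COLORS are importable. The frame below is a toy stand-in for what build_category_model_df returns, with placeholder category and model names; unknown models fall back to grey via MODEL_COLORS.get(...).
import pandas as pd

toy_breakdown = pd.DataFrame({
    "Category": ["Assay A", "Assay A", "Assay B", "Assay B"],
    "Model": ["Model X", "Model Y", "Model X", "Model Y"],
    "Mean score": [0.61, 0.55, 0.48, 0.44],
})
fig = plot_breakdown_facets_sorted_models(toy_breakdown, metric_label="Pearson correlation")
fig.show()  # inside the app this is rendered with st.plotly_chart(fig, use_container_width=True)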