MidAtBest committed
Commit e9035bc · 1 Parent(s): 9fc7bf0

feat: add multiple example plots

Files changed (3)
  1. README.md +1 -8
  2. data/bigwig_dataset.csv +0 -0
  3. src/streamlit_app.py +538 -6
README.md CHANGED
@@ -10,11 +10,4 @@ tags:
pinned: false
short_description: NTv3 Benchmark
license: apache-2.0
- ---
-
- # Welcome to Streamlit!
-
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
+ ---

data/bigwig_dataset.csv CHANGED
The diff for this file is too large to render. See raw diff
 
src/streamlit_app.py CHANGED
@@ -4,6 +4,8 @@ import os
import pandas as pd
import streamlit as st
import plotly.express as px

# ---------------------------------------------------------------------
# Page config (must be the first Streamlit command)
@@ -68,6 +70,15 @@ MODEL_COLORS = {

MODEL_NAMES = list(MODEL_COLORS.keys())

_LAST_UPDATED = "Dec 10, 2025"
_INTRO = """
Benchmark across gene annotation and functional tracks.
@@ -181,12 +192,12 @@ _ALL_ASSAYS = (
_ALL_MODELS = MODEL_NAMES[:]

_BENCHMARKS = {
- "Pearson correlations (multi-assay)": {
"df": _PEARSON_DF,
"metric_label": "Pearson correlation",
"has_assay_type": True,
},
- "MCC (bed tracks)": {
"df": _MCC_DF,
"metric_label": "MCC",
"has_assay_type": False,
@@ -260,6 +271,10 @@ def build_leaderboard(

agg = agg.sort_values("Mean score", ascending=False).reset_index(drop=True)
agg = agg[["Model", "Num entries", "Mean score"]]
return agg

@@ -276,6 +291,303 @@ def build_bar_df(
)

# ---------------------------------------------------------------------
# UI helpers
# ---------------------------------------------------------------------
@@ -318,12 +630,30 @@ def main():

# Species toggles, but only for species present in this benchmark
st.sidebar.subheader("Species")
- available_species = sorted(df_bench["species"].unique())
selected_species: List[str] = []
for sp in available_species:
- if sidebar_toggle(sp, value=True, key=f"species_{benchmark_name}_{sp}"):
selected_species.append(sp)

# Assay toggles (Pearson only), based on filtered species
if cfg.get("has_assay_type", False):
st.sidebar.subheader("Assay types")
@@ -343,9 +673,10 @@
else:
selected_assays = []

# Bed track / dataset toggles (MCC only), based on species selection
selected_datasets: List[str] = []
- if benchmark_name == "MCC":
st.sidebar.subheader("Genome annotations")
if selected_species:
df_for_tracks = df_bench[df_bench["species"].isin(selected_species)]
@@ -377,33 +708,234 @@

with col1:
st.subheader("🏅 Leaderboard (per model)")
if leaderboard_df.empty:
st.info("No data for the selected filters.")
else:
st.dataframe(leaderboard_df, use_container_width=True)

with col2:
st.subheader("📈 Mean score per model")
if bar_df.empty:
st.info("No data for the selected filters.")
else:
fig = px.bar(
bar_df,
x="Model",
y="Mean score",
color="Model",
color_discrete_map=MODEL_COLORS,
)
fig.update_layout(
barmode="group",
height=500,
- xaxis_title="Model",
yaxis_title="Mean score",
plot_bgcolor="rgba(0,0,0,0)",
paper_bgcolor="rgba(0,0,0,0)",
)
st.plotly_chart(fig, use_container_width=True)

if __name__ == "__main__":
main()
 
4
  import pandas as pd
5
  import streamlit as st
6
  import plotly.express as px
7
+ from plotly.subplots import make_subplots
8
+ import plotly.graph_objects as go
9
 
10
  # ---------------------------------------------------------------------
11
  # Page config (must be the first Streamlit command)
 
70
 
71
  MODEL_NAMES = list(MODEL_COLORS.keys())
72
 
73
+ PLANT_SPECIES = ["tomato", "rice", "maize", "arabidopsis"]
74
+ ANIMAL_SPECIES = ["human", "chicken", "cattle"]
75
+
76
+ SPECIES_GROUPS = {
77
+ "Plants": PLANT_SPECIES,
78
+ "Animals": ANIMAL_SPECIES, # (your code calls these HUMAN_SPECIES, but theyโ€™re the โ€œanimalโ€ set)
79
+ }
80
+
81
+
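As an illustrative aside (not part of the commit): a minimal sketch of how these groups are meant to intersect with the species actually present in a benchmark dataframe, mirroring the sidebar logic added further down; the species list here is made up.
available_species_all = ["arabidopsis", "cattle", "human", "maize"]  # toy stand-in for df_bench["species"]
allowed_species = set(SPECIES_GROUPS["Plants"]).intersection(available_species_all)
sorted(allowed_species)  # -> ['arabidopsis', 'maize']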
82
  _LAST_UPDATED = "Dec 10, 2025"
83
  _INTRO = """
84
Benchmark across gene annotation and functional tracks.
 
192
  _ALL_MODELS = MODEL_NAMES[:]
193
 
194
  _BENCHMARKS = {
195
+ "Functional Tracks": {
196
  "df": _PEARSON_DF,
197
  "metric_label": "Pearson correlation",
198
  "has_assay_type": True,
199
  },
200
+ "Genome Annotation": {
201
  "df": _MCC_DF,
202
  "metric_label": "MCC",
203
  "has_assay_type": False,
 
271
 
272
  agg = agg.sort_values("Mean score", ascending=False).reset_index(drop=True)
273
  agg = agg[["Model", "Num entries", "Mean score"]]
274
+
275
+ # Ensure the index starts at 1
276
+ agg.index += 1
277
+
278
  return agg
279
 
280
 
 
291
  )
292
 
293
 
294
+ def build_category_model_df(
295
+ benchmark_name: str,
296
+ selected_species: List[str],
297
+ selected_assays: List[str],
298
+ selected_models: List[str],
299
+ selected_datasets: List[str],
300
+ ) -> pd.DataFrame:
301
+ """
302
+ Mean score per (category, Model) after applying the same filters.
303
+ Category = assay_type (Functional Tracks) or datasets (Genome Annotation).
304
+ """
305
+ cfg = _BENCHMARKS[benchmark_name]
306
+ df = filter_base_df(
307
+ benchmark_name,
308
+ selected_species,
309
+ selected_assays,
310
+ selected_models,
311
+ selected_datasets,
312
+ )
313
+ if df.empty:
314
+ return pd.DataFrame(columns=["Category", "Model", "Mean score"])
315
+
316
+ # Pick the right breakdown column
317
+ if cfg.get("has_assay_type", False) and "assay_type" in df.columns:
318
+ category_col = "assay_type"
319
+ category_label = "Assay type"
320
+ else:
321
+ category_col = "datasets"
322
+ category_label = "Dataset"
323
+
324
+ if category_col not in df.columns:
325
+ return pd.DataFrame(columns=["Category", "Model", "Mean score"])
326
+
327
+ out = (
328
+ df.groupby([category_col, "Model"], as_index=False)["Score"]
329
+ .mean()
330
+ .rename(columns={category_col: "Category", "Score": "Mean score"})
331
+ )
332
+ out["Mean score"] = out["Mean score"].round(3)
333
+ out.attrs["category_label"] = category_label # for nicer axis title
334
+ return out
335
+
336
+
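As an illustrative aside (not part of the commit): with toy scores, made-up assay names, and placeholder model names, the groupby above collapses per-track rows into one mean per (Category, Model) pair:
import pandas as pd

toy = pd.DataFrame({
    "assay_type": ["ATAC", "ATAC", "ATAC", "CAGE"],
    "Model": ["Model X", "Model X", "Model Y", "Model X"],
    "Score": [0.60, 0.70, 0.50, 0.40],
})
out = (
    toy.groupby(["assay_type", "Model"], as_index=False)["Score"]
    .mean()
    .rename(columns={"assay_type": "Category", "Score": "Mean score"})
)
# Category    Model  Mean score
#     ATAC  Model X        0.65
#     ATAC  Model Y        0.50
#     CAGE  Model X        0.40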
337
+ def plot_breakdown_facets_sorted_models(
338
+ breakdown_df: pd.DataFrame,
339
+ metric_label: str,
340
+ height: int = 420,
341
+ ):
342
+ categories = list(breakdown_df["Category"].dropna().unique())
343
+ categories = sorted(categories)
344
+
345
+ n = len(categories)
346
+ if n == 0:
347
+ return None
348
+
349
+ rows = 1
350
+ cols = n  # 👈 everything in one row
351
+
352
+ # Global y-range (consistent scale)
353
+ y_min = breakdown_df["Mean score"].min()
354
+ y_max = breakdown_df["Mean score"].max()
355
+ pad = 0.05 * (y_max - y_min if y_max > y_min else 1.0)
356
+ y_range = [y_min - pad, y_max + pad]
357
+
358
+ fig = make_subplots(
359
+ rows=rows,
360
+ cols=cols,
361
+ subplot_titles=categories,
362
+ shared_yaxes=True,
363
+ horizontal_spacing=0.04, # tighter spacing
364
+ )
365
+
366
+ for i, cat in enumerate(categories):
367
+ r = (i // cols) + 1
368
+ c = (i % cols) + 1
369
+
370
+ sub = (
371
+ breakdown_df[breakdown_df["Category"] == cat]
372
+ .sort_values("Mean score", ascending=True)
373
+ )
374
+
375
+ fig.add_trace(
376
+ go.Bar(
377
+ x=sub["Model"],
378
+ y=sub["Mean score"],
379
+ marker_color=[MODEL_COLORS.get(m, "#808080") for m in sub["Model"]],
380
+ showlegend=False,
381
+ ),
382
+ row=r,
383
+ col=c,
384
+ )
385
+
386
+ fig.update_xaxes(showticklabels=False, title_text="", row=r, col=c)
387
+ fig.update_yaxes(range=y_range, title_text="", row=r, col=c)  # 👈 apply range
388
+
389
+ fig.update_layout(
390
+ height=height,
391
+ plot_bgcolor="rgba(0,0,0,0)",
392
+ paper_bgcolor="rgba(0,0,0,0)",
393
+ margin=dict(t=60, l=10, r=10, b=10),
394
+ )
395
+
396
+ # Single y-axis label on the leftmost panel
397
+ fig.update_yaxes(title_text=metric_label, row=1, col=1)
398
+
399
+ return fig
400
+
401
+
402
+ def build_radar_df(
403
+ benchmark_name: str,
404
+ selected_species: List[str],
405
+ selected_assays: List[str],
406
+ selected_models: List[str],
407
+ selected_datasets: List[str],
408
+ ) -> pd.DataFrame:
409
+ cfg = _BENCHMARKS[benchmark_name]
410
+
411
+ df = filter_base_df(
412
+ benchmark_name,
413
+ selected_species,
414
+ selected_assays,
415
+ selected_models,
416
+ selected_datasets,
417
+ )
418
+
419
+ if df.empty:
420
+ return pd.DataFrame()
421
+
422
+ # Choose axis column
423
+ if cfg.get("has_assay_type", False) and "assay_type" in df.columns:
424
+ axis_col = "assay_type"
425
+ axis_label = "Assay type"
426
+ else:
427
+ axis_col = "datasets"
428
+ axis_label = "Dataset"
429
+
430
+ radar_df = (
431
+ df.groupby([axis_col, "Model"], as_index=False)["Score"]
432
+ .mean()
433
+ .rename(columns={axis_col: "Axis", "Score": "Value"})
434
+ )
435
+
436
+ radar_df.attrs["axis_label"] = axis_label
437
+ return radar_df
438
+
439
+
440
+ def build_pairwise_scatter_df(
441
+ benchmark_name: str,
442
+ selected_species: List[str],
443
+ selected_assays: List[str],
444
+ selected_models: List[str],
445
+ selected_datasets: List[str],
446
+ model_a: str,
447
+ model_b: str,
448
+ ) -> pd.DataFrame:
449
+ """
450
+ Returns a per-track dataframe with columns:
451
+ Track, Model A, Model B, (optional) species, (optional) assay_type, datasets
452
+ Where each row corresponds to a specific track (datasets [+ assay_type]).
453
+ """
454
+ cfg = _BENCHMARKS[benchmark_name]
455
+
456
+ # Filter using the same UI toggles, but ensure the chosen models are included
457
+ models_for_filter = list(set(selected_models + [model_a, model_b])) if selected_models else [model_a, model_b]
458
+
459
+ df = filter_base_df(
460
+ benchmark_name,
461
+ selected_species,
462
+ selected_assays,
463
+ models_for_filter,
464
+ selected_datasets,
465
+ )
466
+
467
+ if df.empty:
468
+ return pd.DataFrame()
469
+
470
+ # Define what "a specific track" means
471
+ track_cols = ["datasets"]
472
+ if cfg.get("has_assay_type", False) and "assay_type" in df.columns:
473
+ track_cols = ["assay_type", "datasets"]
474
+
475
+ # (Optional) keep species in hover if multiple are selected
476
+ keep_species = "species" in df.columns and (selected_species is None or len(selected_species) != 1)
477
+
478
+ id_cols = (["species"] if keep_species else []) + track_cols
479
+
480
+ # Pivot into two model columns
481
+ wide = (
482
+ df[df["Model"].isin([model_a, model_b])]
483
+ .pivot_table(index=id_cols, columns="Model", values="Score", aggfunc="mean")
484
+ .reset_index()
485
+ )
486
+
487
+ # Require both values to exist for a dot
488
+ if model_a not in wide.columns or model_b not in wide.columns:
489
+ return pd.DataFrame()
490
+
491
+ wide = wide.dropna(subset=[model_a, model_b])
492
+
493
+ # Nice "Track" label for display
494
+ if "assay_type" in wide.columns:
495
+ wide["Track"] = wide["assay_type"].astype(str) + " / " + wide["datasets"].astype(str)
496
+ else:
497
+ wide["Track"] = wide["datasets"].astype(str)
498
+
499
+ # Rename for plotting
500
+ wide = wide.rename(columns={model_a: "Model A", model_b: "Model B"})
501
+
502
+ return wide
503
+
504
+
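As an illustrative aside (not part of the commit): the pivot step above turns the long per-track score table into one row per track with one column per chosen model; values, track names, and model names below are placeholders.
import pandas as pd

long_df = pd.DataFrame({
    "datasets": ["track_1", "track_1", "track_2", "track_2"],
    "Model": ["Model X", "Model Y", "Model X", "Model Y"],
    "Score": [0.82, 0.79, 0.55, 0.61],
})
wide = (
    long_df.pivot_table(index=["datasets"], columns="Model", values="Score", aggfunc="mean")
    .reset_index()
)
#   datasets  Model X  Model Y
#    track_1     0.82     0.79
#    track_2     0.55     0.61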
505
+ def build_violin_df(
506
+ benchmark_name: str,
507
+ selected_species: List[str],
508
+ selected_assays: List[str],
509
+ selected_models: List[str],
510
+ selected_datasets: List[str],
511
+ ) -> pd.DataFrame:
512
+ # Use the same base filtering, but keep all per-track rows
513
+ df = filter_base_df(
514
+ benchmark_name,
515
+ selected_species,
516
+ selected_assays,
517
+ selected_models,
518
+ selected_datasets,
519
+ )
520
+ # Keep only needed columns
521
+ keep = ["Model", "Score"]
522
+ for col in ["species", "assay_type", "datasets"]:
523
+ if col in df.columns:
524
+ keep.append(col)
525
+ return df[keep].copy()
526
+
527
+
528
+ def plot_radar(
529
+ radar_df: pd.DataFrame,
530
+ metric_label: str,
531
+ height: int = 600,
532
+ ):
533
+ if radar_df.empty:
534
+ return None
535
+
536
+ axes = radar_df["Axis"].unique().tolist()
537
+
538
+ # Global radial range
539
+ r_min = radar_df["Value"].min()
540
+ r_max = radar_df["Value"].max()
541
+ pad = 0.05 * (r_max - r_min if r_max > r_min else 1.0)
542
+ r_range = [r_min - pad, r_max + pad]
543
+
544
+ fig = go.Figure()
545
+
546
+ for model in radar_df["Model"].unique():
547
+ sub = radar_df[radar_df["Model"] == model]
548
+
549
+ # Ensure consistent axis ordering
550
+ sub = sub.set_index("Axis").reindex(axes)
551
+
552
+ fig.add_trace(
553
+ go.Scatterpolar(
554
+ r=sub["Value"],
555
+ theta=axes,
556
+ fill="toself",
557
+ name=model,
558
+ line_color=MODEL_COLORS.get(model),
559
+ opacity=0.75,
560
+ )
561
+ )
562
+
563
+ fig.update_layout(
564
+ height=height,
565
+ polar=dict(
566
+ bgcolor="rgba(0,0,0,0)",  # 👈 polar background
567
+ radialaxis=dict(
568
+ title=metric_label,
569
+ range=r_range,
570
+ tickformat=".2f",
571
+ showgrid=True,
572
+ gridcolor="rgba(0,0,0,0.15)", # subtle grid
573
+ ),
574
+ angularaxis=dict(
575
+ showgrid=True,
576
+ gridcolor="rgba(0,0,0,0.15)",
577
+ ),
578
+ ),
579
+ paper_bgcolor="rgba(0,0,0,0)",  # 👈 entire figure background
580
+ plot_bgcolor="rgba(0,0,0,0)",  # 👈 plot area
581
+ showlegend=True,
582
+ legend_title_text="Model",
583
+ margin=dict(t=40, b=40, l=40, r=40),
584
+ )
585
+
586
+
587
+ return fig
588
+
589
+
590
+
591
  # ---------------------------------------------------------------------
592
  # UI helpers
593
  # ---------------------------------------------------------------------
 
630
 
631
  # Species toggles, but only for species present in this benchmark
632
  st.sidebar.subheader("Species")
633
+
634
+ # Toggle: Plants vs Animals
635
+ species_group = st.sidebar.radio(
636
+ "Group",
637
+ options=["Animals", "Plants"],
638
+ index=0,
639
+ horizontal=True,
640
+ key=f"species_group_{benchmark_name}",
641
+ )
642
+
643
+ available_species_all = sorted(df_bench["species"].unique())
644
+ allowed_species = set(SPECIES_GROUPS[species_group]).intersection(available_species_all)
645
+ available_species = sorted(allowed_species)
646
+
647
  selected_species: List[str] = []
648
  for sp in available_species:
649
+ if sidebar_toggle(sp, value=True, key=f"species_{benchmark_name}_{species_group}_{sp}"):
650
  selected_species.append(sp)
651
 
652
+ # (Optional) If no species exist for that group in this benchmark
653
+ if not available_species:
654
+ st.sidebar.info(f"No {species_group.lower()} species available for this benchmark.")
655
+
656
+
657
  # Assay toggles (Pearson only), based on filtered species
658
  if cfg.get("has_assay_type", False):
659
  st.sidebar.subheader("Assay types")
 
673
  else:
674
  selected_assays = []
675
 
676
+
677
  # Bed track / dataset toggles (MCC only), based on species selection
678
  selected_datasets: List[str] = []
679
+ if benchmark_name == "Genome Annotation":
680
  st.sidebar.subheader("Genome annotations")
681
  if selected_species:
682
  df_for_tracks = df_bench[df_bench["species"].isin(selected_species)]
 
708
 
709
  with col1:
710
st.subheader("🏅 Leaderboard (per model)")
711
+ st.write("\n")  # 👈 spacer to match plotly padding
712
+ st.write("\n")
713
+ st.write("\n")
714
  if leaderboard_df.empty:
715
  st.info("No data for the selected filters.")
716
  else:
717
  st.dataframe(leaderboard_df, use_container_width=True)
718
 
719
+
720
  with col2:
721
st.subheader("📈 Mean score per model")
722
  if bar_df.empty:
723
  st.info("No data for the selected filters.")
724
  else:
725
+ # Order models by performance (least -> most)
726
+ bar_df = bar_df.sort_values("Mean score", ascending=True)
727
+
728
+ model_order = bar_df["Model"].tolist()
729
+
730
  fig = px.bar(
731
  bar_df,
732
  x="Model",
733
  y="Mean score",
734
  color="Model",
735
  color_discrete_map=MODEL_COLORS,
736
+ category_orders={"Model": model_order}, # enforce ordering on x
737
  )
738
  fig.update_layout(
739
  barmode="group",
740
  height=500,
741
+ xaxis_title="",
742
  yaxis_title="Mean score",
743
  plot_bgcolor="rgba(0,0,0,0)",
744
  paper_bgcolor="rgba(0,0,0,0)",
745
+ bargap=0.08,
746
  )
747
+
748
+ # Hide x-axis model names (same style as the panels)
749
+ fig.update_xaxes(showticklabels=False)
750
+
751
  st.plotly_chart(fig, use_container_width=True)
752
 
753
 
754
+ # --- Breakdown plot: assay_type (Functional Tracks) OR datasets (Genome Annotation) ---
755
+ breakdown_df = build_category_model_df(
756
+ benchmark_name, selected_species, selected_assays, selected_models, selected_datasets
757
+ )
758
+
759
+ st.subheader("🧪 Mean score by assay type / dataset (all models)")
760
+ if breakdown_df.empty:
761
+ st.info("No data for the selected filters.")
762
+ else:
763
+ fig_breakdown = plot_breakdown_facets_sorted_models(
764
+ breakdown_df,
765
+ metric_label=cfg["metric_label"],
766
+ height=300,
767
+ )
768
+ st.plotly_chart(fig_breakdown, use_container_width=True)
769
+
770
+ st.subheader("🕸️ Performance by assay type / dataset (radar)")
771
+ radar_df = build_radar_df(
772
+ benchmark_name,
773
+ selected_species,
774
+ selected_assays,
775
+ selected_models,
776
+ selected_datasets,
777
+ )
778
+
779
+ if radar_df.empty:
780
+ st.info("No data for the selected filters.")
781
+ else:
782
+ fig_radar = plot_radar(
783
+ radar_df,
784
+ metric_label=cfg["metric_label"],
785
+ )
786
+ st.plotly_chart(fig_radar, use_container_width=True)
787
+
788
+ st.subheader("⚖️ Model comparison")
789
+
790
+ left, right = st.columns([1, 1], gap="large")
791
+
792
+ with left:
793
+ st.markdown("#### Head-to-head (per track)")
794
+
795
+ model_picker_options = selected_models if selected_models else _ALL_MODELS
796
+ default_a = model_picker_options[0] if model_picker_options else _ALL_MODELS[0]
797
+ default_b = model_picker_options[1] if len(model_picker_options) > 1 else (
798
+ _ALL_MODELS[1] if len(_ALL_MODELS) > 1 else default_a
799
+ )
800
+
801
+ cA, cB = st.columns([1, 1])
802
+ with cA:
803
+ model_a = st.selectbox(
804
+ "Model A (y-axis)",
805
+ options=model_picker_options,
806
+ index=model_picker_options.index(default_a) if default_a in model_picker_options else 0,
807
+ key=f"pair_model_a_{benchmark_name}",
808
+ )
809
+ with cB:
810
+ b_options = [m for m in model_picker_options if m != model_a] or model_picker_options
811
+ model_b = st.selectbox(
812
+ "Model B (x-axis)",
813
+ options=b_options,
814
+ index=0,
815
+ key=f"pair_model_b_{benchmark_name}",
816
+ )
817
+
818
+ scatter_df = build_pairwise_scatter_df(
819
+ benchmark_name,
820
+ selected_species,
821
+ selected_assays,
822
+ selected_models,
823
+ selected_datasets,
824
+ model_a,
825
+ model_b,
826
+ )
827
+
828
+ if scatter_df.empty:
829
+ st.info("No overlapping tracks for the selected filters (or one model is missing values).")
830
+ else:
831
+ min_v = float(min(scatter_df["Model A"].min(), scatter_df["Model B"].min()))
832
+ max_v = float(max(scatter_df["Model A"].max(), scatter_df["Model B"].max()))
833
+ pad = 0.05 * (max_v - min_v if max_v > min_v else 1.0)
834
+ axis_range = [min_v - pad, max_v + pad]
835
+ tick_step = (axis_range[1] - axis_range[0]) / 5
836
+
837
+ hover_cols = ["Track"]
838
+ for c in ["species", "assay_type", "datasets"]:
839
+ if c in scatter_df.columns:
840
+ hover_cols.append(c)
841
+
842
+ # Model A on Y, Model B on X
843
+ fig_scatter = px.scatter(
844
+ scatter_df,
845
+ x="Model B",
846
+ y="Model A",
847
+ hover_name="Track",
848
+ hover_data=hover_cols,
849
+ )
850
+
851
+ # Red diagonal y=x
852
+ fig_scatter.add_shape(
853
+ type="line",
854
+ x0=axis_range[0], y0=axis_range[0],
855
+ x1=axis_range[1], y1=axis_range[1],
856
+ xref="x", yref="y",
857
+ line=dict(color="red", dash="dot", width=2),
858
+ )
859
+
860
+ # Square + identical scale/ticks (works even with use_container_width=True)
861
+ fig_scatter.update_layout(
862
+ height=550,
863
+ margin=dict(l=60, r=20, t=20, b=60),
864
+ xaxis=dict(
865
+ title=f"{model_b} โ€” {cfg['metric_label']}",
866
+ range=axis_range,
867
+ dtick=tick_step,
868
+ constrain="domain",
869
+ ),
870
+ yaxis=dict(
871
+ title=f"{model_a} โ€” {cfg['metric_label']}",
872
+ range=axis_range,
873
+ dtick=tick_step,
874
+ scaleanchor="x", # lock y to x
875
+ scaleratio=1,
876
+ constrain="domain",
877
+ ),
878
+ plot_bgcolor="rgba(0,0,0,0)",
879
+ paper_bgcolor="rgba(0,0,0,0)",
880
+ )
881
+
882
+ st.plotly_chart(fig_scatter, use_container_width=True)
883
+
884
+ with right:
885
+ st.markdown("#### All models (distribution across tracks)")
886
+
887
+ violin_df = build_violin_df(
888
+ benchmark_name,
889
+ selected_species,
890
+ selected_assays,
891
+ selected_models,
892
+ selected_datasets,
893
+ )
894
+
895
+ if violin_df.empty:
896
+ st.info("No data for the selected filters.")
897
+ else:
898
+ # Order models by median performance (least -> most)
899
+ model_order = (
900
+ violin_df
901
+ .groupby("Model")["Score"]
902
+ .median()
903
+ .sort_values(ascending=True)
904
+ .index
905
+ .tolist()
906
+ )
907
+
908
+ fig_violin = px.violin(
909
+ violin_df,
910
+ x="Model",
911
+ y="Score",
912
+ color="Model",
913
+ color_discrete_map=MODEL_COLORS,
914
+ box=True, # keep inner boxplot
915
+ points=False,  # 👈 remove all dots
916
+ category_orders={"Model": model_order},  # 👈 enforce ordering
917
+ )
918
+
919
+ fig_violin.update_layout(
920
+ height=650,
921
+ xaxis_title="",
922
+ yaxis_title=cfg["metric_label"],
923
+ plot_bgcolor="rgba(0,0,0,0)",
924
+ paper_bgcolor="rgba(0,0,0,0)",
925
+ showlegend=False,
926
+ )
927
+
928
+ fig_violin.update_traces(
929
+ box_visible=True,
930
+ meanline_visible=False,
931
+ )
932
+
933
+ # Optional: hide model names if you prefer a cleaner look
934
+ # fig_violin.update_xaxes(showticklabels=False)
935
+
936
+ st.plotly_chart(fig_violin, use_container_width=True)
937
+
938
+
939
+
940
  if __name__ == "__main__":
941
  main()
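As an illustrative aside (not part of the commit): a quick way to eyeball the new figure builders outside the Streamlit UI is to call them from a session where the module's helpers and MODEL_COLORS are importable. The frame below is a toy stand-in for what build_category_model_df returns, with placeholder category and model names; unknown models fall back to grey via MODEL_COLORS.get(...).
import pandas as pd

toy_breakdown = pd.DataFrame({
    "Category": ["Assay A", "Assay A", "Assay B", "Assay B"],
    "Model": ["Model X", "Model Y", "Model X", "Model Y"],
    "Mean score": [0.61, 0.55, 0.48, 0.44],
})
fig = plot_breakdown_facets_sorted_models(toy_breakdown, metric_label="Pearson correlation")
fig.show()  # inside the app this is rendered with st.plotly_chart(fig, use_container_width=True)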