LukeFP committed on
Commit
bc9ecb4
·
1 Parent(s): 261241c

added the folder

Browse files
0219_gradio/README.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PhySH Taxonomy Classifier — Gradio App
2
+
3
+ Interactive web app that predicts APS PhySH **disciplines** and **research-area concepts**
4
+ for a given paper title + abstract.
5
+
6
+ ## How it works
7
+
8
+ 1. Text is embedded with `google/embeddinggemma-300m` (768-dim, L2-normalised).
9
+ 2. **Stage 1** — A multi-label MLP predicts discipline probabilities (18 classes).
10
+ 3. **Stage 2** — A discipline-conditioned MLP concatenates the embedding with discipline
11
+ probabilities and predicts research-area concepts (186 classes).
12
+
13
+ Both models are `.pt` checkpoints trained in `../0120_taxonomy_training_inference/`.
14
+
15
+ ## Setup
16
+
17
+ The app uses the project-level virtualenv (`.venv` at the repo root).
18
+
19
+ ```bash
20
+ # From the repo root
21
+ source .venv/bin/activate
22
+
23
+ # Install the one extra dependency
24
+ pip install gradio
25
+ ```
26
+
27
+ ## Run
28
+
29
+ ```bash
30
+ cd 0219_gradio
31
+ python app.py
32
+ ```
33
+
34
+ Then open `http://127.0.0.1:7860` in your browser.
35
+
36
+ ## Model files
37
+
38
+ The app expects these checkpoints in the same directory as `app.py`:
39
+
40
+ - `discipline_classifier_gemma_20260130_140842.pt`
41
+ - `concept_conditioned_gemma_20260130_140842.pt`
0219_gradio/__pycache__/app.cpython-313.pyc ADDED
Binary file (13.9 kB). View file
 
0219_gradio/app.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhySH Taxonomy Classifier — Gradio App
3
+
4
+ Two-stage hierarchical cascade:
5
+ Stage 1 → Discipline prediction (18-class multi-label)
6
+ Stage 2 → Concept prediction (186-class multi-label, conditioned on discipline probs)
7
+
8
+ Models were trained on APS PhySH labels with google/embeddinggemma-300m embeddings.
9
+ """
10
+
11
+ import re
12
+ from pathlib import Path
13
+ from typing import Dict, List, Tuple
14
+
15
+ import gradio as gr
16
+ import numpy as np
17
+ import torch
18
+ import torch.nn as nn
19
+ from sentence_transformers import SentenceTransformer
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Model definitions (mirror the training code exactly)
23
+ # ---------------------------------------------------------------------------
24
+
25
class MultiLabelMLP(nn.Module):
    """Feed-forward multi-label classifier.

    A stack of Linear → ReLU → Dropout blocks followed by a final Linear
    head that emits raw logits (sigmoid is applied by the caller).
    """

    def __init__(self, input_dim: int, output_dim: int,
                 hidden_layers: Tuple[int, ...] = (1024, 512), dropout: float = 0.3):
        super().__init__()
        # Pair consecutive widths to build each hidden block.
        widths = [input_dim, *hidden_layers]
        stack = []
        for d_in, d_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(d_in, d_out), nn.ReLU(), nn.Dropout(dropout)]
        stack.append(nn.Linear(widths[-1], output_dim))
        # Named "network" so checkpoint state-dict keys line up.
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        # Returns logits, not probabilities.
        return self.network(x)
39
+
40
+
41
class DisciplineConditionedMLP(nn.Module):
    """Concept classifier conditioned on Stage-1 discipline predictions.

    The text embedding is concatenated with the discipline vector
    (optionally mapped back to logit space) and passed through an MLP
    that emits raw concept logits.
    """

    def __init__(self, embedding_dim: int, discipline_dim: int, output_dim: int,
                 hidden_layers: Tuple[int, ...] = (1024, 512), dropout: float = 0.3,
                 discipline_dropout: float = 0.0, use_logits: bool = False):
        super().__init__()
        self.use_logits = use_logits
        # Dropout applied only to the discipline features, not the embedding.
        self.discipline_dropout = nn.Dropout(discipline_dropout)
        widths = [embedding_dim + discipline_dim, *hidden_layers]
        stack = []
        for d_in, d_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(d_in, d_out), nn.ReLU(), nn.Dropout(dropout)]
        stack.append(nn.Linear(widths[-1], output_dim))
        # Named "network" so checkpoint state-dict keys line up.
        self.network = nn.Sequential(*stack)

    def forward(self, embedding: torch.Tensor, discipline_probs: torch.Tensor) -> torch.Tensor:
        if self.use_logits:
            # Invert the sigmoid (clamped for numerical safety) so the
            # network sees unbounded logits instead of [0, 1] values.
            p = torch.clamp(discipline_probs, 1e-7, 1 - 1e-7)
            disc = torch.log(p / (1 - p))
        else:
            disc = discipline_probs
        disc = self.discipline_dropout(disc)
        return self.network(torch.cat([embedding, disc], dim=1))
64
+
65
+
66
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Checkpoints are expected to sit next to this file (see the README).
MODELS_DIR = Path(__file__).resolve().parent
DISCIPLINE_MODEL_PATH = MODELS_DIR / "discipline_classifier_gemma_20260130_140842.pt"
CONCEPT_MODEL_PATH = MODELS_DIR / "concept_conditioned_gemma_20260130_140842.pt"
# SentenceTransformer model id used to embed title+abstract.
EMBEDDING_MODEL_NAME = "google/embeddinggemma-300m"
73
+
74
# ---------------------------------------------------------------------------
# Globals (loaded once at startup)
# ---------------------------------------------------------------------------
# All of these are populated by load_models(); None/empty until then.
device: str = "cpu"
embedding_model: SentenceTransformer | None = None
discipline_model: MultiLabelMLP | None = None          # Stage 1 model
concept_model: DisciplineConditionedMLP | None = None  # Stage 2 model
# Label metadata from the checkpoints; each entry is a dict that predict()
# reads a "label" key from.
discipline_labels: List[Dict] = []
concept_labels: List[Dict] = []
83
+
84
+
85
def load_models():
    """Load the embedder and both cascade checkpoints into module globals.

    Populates: device, embedding_model, discipline_model, concept_model,
    discipline_labels, concept_labels.
    """
    global device, embedding_model, discipline_model, concept_model
    global discipline_labels, concept_labels

    # Device preference: CUDA, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    print(f"Loading embedding model ({EMBEDDING_MODEL_NAME}) on {device} …")
    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)

    # --- Stage 1: discipline classifier ---
    ckpt = torch.load(DISCIPLINE_MODEL_PATH, map_location=device, weights_only=False)
    cfg = ckpt["model_config"]
    model = MultiLabelMLP(
        cfg["input_dim"], cfg["output_dim"],
        tuple(cfg["hidden_layers"]), cfg["dropout"],
    )
    model.load_state_dict(ckpt["model_state_dict"])
    model.to(device).eval()
    discipline_model = model
    discipline_labels = ckpt["class_labels"]

    # --- Stage 2: discipline-conditioned concept classifier ---
    ckpt = torch.load(CONCEPT_MODEL_PATH, map_location=device, weights_only=False)
    cfg = ckpt["model_config"]
    model = DisciplineConditionedMLP(
        cfg["embedding_dim"], cfg["discipline_dim"], cfg["output_dim"],
        tuple(cfg["hidden_layers"]), cfg["dropout"],
        cfg.get("discipline_dropout", 0.0), cfg.get("use_logits", False),
    )
    model.load_state_dict(ckpt["model_state_dict"])
    model.to(device).eval()
    concept_model = model
    concept_labels = ckpt["class_labels"]

    print(f"Loaded {len(discipline_labels)} disciplines, {len(concept_labels)} concepts")
123
+
124
+
125
+ # ---------------------------------------------------------------------------
126
+ # Prediction
127
+ # ---------------------------------------------------------------------------
128
+
129
def clean_text(text: str) -> str:
    """Collapse all whitespace runs to single spaces and trim the ends.

    Falsy input (None or "") yields an empty string.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
133
+
134
+
135
+ def predict(title: str, abstract: str, threshold: float, top_k: int):
136
+ """Run the two-stage cascade and return formatted results."""
137
+ combined = clean_text(title)
138
+ abs_clean = clean_text(abstract)
139
+ if combined and abs_clean:
140
+ combined = f"{combined} [SEP] {abs_clean}"
141
+ elif abs_clean:
142
+ combined = abs_clean
143
+
144
+ if not combined.strip():
145
+ return "Please enter at least a title or abstract.", ""
146
+
147
+ # Embed
148
+ embedding = embedding_model.encode(
149
+ [combined], normalize_embeddings=True, convert_to_numpy=True,
150
+ )
151
+ emb_tensor = torch.FloatTensor(embedding).to(device)
152
+
153
+ with torch.no_grad():
154
+ # Stage 1
155
+ disc_logits = discipline_model(emb_tensor)
156
+ disc_probs = torch.sigmoid(disc_logits).cpu().numpy()[0]
157
+
158
+ # Stage 2
159
+ disc_probs_tensor = torch.FloatTensor(disc_probs).unsqueeze(0).to(device)
160
+ conc_logits = concept_model(emb_tensor, disc_probs_tensor)
161
+ conc_probs = torch.sigmoid(conc_logits).cpu().numpy()[0]
162
+
163
+ # Format discipline results
164
+ disc_order = np.argsort(disc_probs)[::-1]
165
+ disc_lines = []
166
+ for rank, idx in enumerate(disc_order[:top_k], 1):
167
+ prob = disc_probs[idx]
168
+ label = discipline_labels[idx].get("label", f"Discipline_{idx}")
169
+ marker = "**" if prob >= threshold else ""
170
+ disc_lines.append(f"{rank}. {marker}{label}{marker} — {prob:.1%}")
171
+
172
+ # Format concept results
173
+ conc_order = np.argsort(conc_probs)[::-1]
174
+ conc_lines = []
175
+ for rank, idx in enumerate(conc_order[:top_k], 1):
176
+ prob = conc_probs[idx]
177
+ label = concept_labels[idx].get("label", f"Concept_{idx}")
178
+ marker = "**" if prob >= threshold else ""
179
+ conc_lines.append(f"{rank}. {marker}{label}{marker} — {prob:.1%}")
180
+
181
+ disc_md = f"### Disciplines (threshold ≥ {threshold:.0%})\n\n" + "\n".join(disc_lines)
182
+ conc_md = f"### Research-Area Concepts (threshold ≥ {threshold:.0%})\n\n" + "\n".join(conc_lines)
183
+ return disc_md, conc_md
184
+
185
+
186
+ # ---------------------------------------------------------------------------
187
+ # Gradio UI
188
+ # ---------------------------------------------------------------------------
189
+
190
# Pre-filled (title, abstract) demo papers surfaced via gr.Examples below.
EXAMPLES = [
    [
        "Observation of Gravitational Waves from a Binary Black Hole Merger",
        "On September 14, 2015 at 09:50:45 UTC the two detectors of the Laser "
        "Interferometer Gravitational-Wave Observatory simultaneously observed a "
        "transient gravitational-wave signal. The signal sweeps upwards in frequency "
        "from 35 to 250 Hz with a peak gravitational-wave strain of 1.0×10⁻²¹.",
    ],
    [
        "Topological Insulators and Superconductors",
        "Topological insulators are electronic materials that have a bulk band gap "
        "like an ordinary insulator but have protected conducting states on their "
        "edge or surface. We review the theoretical foundation for topological "
        "insulators and superconductors and describe recent experiments.",
    ],
    [
        "Deep Learning for Particle Physics",
        "We review the application of modern machine learning techniques to the "
        "analysis of data from high-energy particle physics experiments. Neural "
        "networks are used for jet tagging, event classification, anomaly detection, "
        "and fast simulation of detector response.",
    ],
]
213
+
214
+
215
def build_app() -> gr.Blocks:
    """Assemble and return the Gradio Blocks UI.

    Pure UI wiring — no models are loaded here (see load_models()).
    """
    with gr.Blocks(
        title="PhySH Taxonomy Classifier",
        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="slate"),
    ) as ui:
        gr.Markdown(
            "# PhySH Taxonomy Classifier\n"
            "Enter a paper **title** and **abstract** to predict APS PhySH disciplines "
            "and research-area concepts using a two-stage hierarchical cascade.\n\n"
            "Labels above the threshold are **bolded**."
        )

        with gr.Row():
            # Left column: text inputs plus the two tuning controls.
            with gr.Column(scale=2):
                title_input = gr.Textbox(label="Title", lines=2, placeholder="Paper title …")
                abstract_input = gr.Textbox(label="Abstract", lines=8, placeholder="Paper abstract …")

                with gr.Row():
                    threshold_ctl = gr.Slider(
                        minimum=0.05, maximum=0.95, value=0.35, step=0.05,
                        label="Threshold",
                    )
                    topk_ctl = gr.Slider(
                        minimum=1, maximum=20, value=10, step=1, label="Top-K",
                    )

                classify_btn = gr.Button("Classify", variant="primary", size="lg")

            # Right column: one markdown pane per prediction stage.
            with gr.Column(scale=3):
                disciplines_out = gr.Markdown(label="Disciplines")
                concepts_out = gr.Markdown(label="Concepts")

        classify_btn.click(
            fn=predict,
            inputs=[title_input, abstract_input, threshold_ctl, topk_ctl],
            outputs=[disciplines_out, concepts_out],
        )

        gr.Examples(
            examples=EXAMPLES,
            inputs=[title_input, abstract_input],
            label="Example papers",
        )

    return ui
260
+
261
+
262
if __name__ == "__main__":
    # Load the embedder and both checkpoints once, then serve the UI
    # (Gradio default address: http://127.0.0.1:7860, per the README).
    load_models()
    app = build_app()
    app.launch()
0219_gradio/concept_conditioned_gemma_20260130_140842.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77da740b38d773acad76a8b1f9d8b4a37a28bcefd3ef1d869564fbbcda7e18d7
3
+ size 5733613
0219_gradio/discipline_classifier_gemma_20260130_140842.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30d46f03c0a5c10d747525096b46c63909a86c40c7a4adc2c5989846c8e4ae61
3
+ size 5291653
0219_gradio/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=5.0,<6.0
2
+ torch>=2.0
3
+ sentence-transformers>=3.0
4
+ numpy